From b86b0b150fed840c376145383ef5105116c81b0c Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 27 Mar 2020 11:32:14 -0700 Subject: Bluetooth: L2CAP: Fix handling LE modes by L2CAP_OPTIONS L2CAP_OPTIONS shall only be used with BR/EDR modes. Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_sock.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'net')

diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 117ba20ea194..cfb402645c26 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -424,6 +424,20 @@ static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, break; } + /* Only BR/EDR modes are supported here */ + switch (chan->mode) { + case L2CAP_MODE_BASIC: + case L2CAP_MODE_ERTM: + case L2CAP_MODE_STREAMING: + break; + default: + err = -EINVAL; + break; + } + + if (err < 0) + break; + memset(&opts, 0, sizeof(opts)); opts.imtu = chan->imtu; opts.omtu = chan->omtu; @@ -698,10 +712,8 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, break; } - chan->mode = opts.mode; - switch (chan->mode) { - case L2CAP_MODE_LE_FLOWCTL: - break; + /* Only BR/EDR modes are supported here */ + switch (opts.mode) { case L2CAP_MODE_BASIC: clear_bit(CONF_STATE2_DEVICE, &chan->conf_state); break; @@ -715,6 +727,11 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, break; } + if (err < 0) + break; + + chan->mode = opts.mode; + BT_DBG("mode 0x%2.2x", chan->mode); chan->imtu = opts.imtu; -- cgit v1.2.3

From 3ee7b7cd83900bb711efadbf16fa096a615a1566 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 27 Mar 2020 11:32:15 -0700 Subject: Bluetooth: Add BT_MODE socket option This adds the BT_MODE socket option, which can be used to set L2CAP modes, including LE-only modes that could not be set using L2CAP_OPTIONS.
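For illustration, a minimal user-space sketch of the new option (an editorial example, not part of this patch; the BlueZ-style include path is an assumption):

	#include <stdint.h>
	#include <stdio.h>
	#include <sys/socket.h>
	#include <bluetooth/bluetooth.h> /* assumed to provide SOL_BLUETOOTH, BT_MODE */

	/* sk is an L2CAP socket that has been bound but not yet connected:
	 * as the kernel code below shows, BT_MODE takes a single u8, is only
	 * accepted while the socket is in BT_BOUND state, and the whole
	 * option is gated behind the kernel's enable_ecred switch.
	 */
	static int set_le_flowctl(int sk)
	{
		uint8_t mode = BT_MODE_LE_FLOWCTL;

		if (setsockopt(sk, SOL_BLUETOOTH, BT_MODE, &mode, sizeof(mode)) < 0) {
			perror("setsockopt(BT_MODE)");
			return -1;
		}

		return 0;
	}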
Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/bluetooth.h | 8 +++ net/bluetooth/l2cap_sock.c | 113 +++++++++++++++++++++++++++++++++++++- 2 files changed, 120 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 1576353a2773..3fa7b1e3c5d9 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -139,6 +139,14 @@ struct bt_voice { #define BT_PHY_LE_CODED_TX 0x00002000 #define BT_PHY_LE_CODED_RX 0x00004000 +#define BT_MODE 15 + +#define BT_MODE_BASIC 0x00 +#define BT_MODE_ERTM 0x01 +#define BT_MODE_STREAMING 0x02 +#define BT_MODE_LE_FLOWCTL 0x03 +#define BT_MODE_EXT_FLOWCTL 0x04 + __printf(1, 2) void bt_info(const char *fmt, ...); __printf(1, 2) diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index cfb402645c26..1cea42ee1e92 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -395,6 +395,24 @@ static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, return sizeof(struct sockaddr_l2); } +static int l2cap_get_mode(struct l2cap_chan *chan) +{ + switch (chan->mode) { + case L2CAP_MODE_BASIC: + return BT_MODE_BASIC; + case L2CAP_MODE_ERTM: + return BT_MODE_ERTM; + case L2CAP_MODE_STREAMING: + return BT_MODE_STREAMING; + case L2CAP_MODE_LE_FLOWCTL: + return BT_MODE_LE_FLOWCTL; + case L2CAP_MODE_EXT_FLOWCTL: + return BT_MODE_EXT_FLOWCTL; + } + + return -EINVAL; +} + static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen) { @@ -522,7 +540,7 @@ static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, struct bt_security sec; struct bt_power pwr; u32 phys; - int len, err = 0; + int len, mode, err = 0; BT_DBG("sk %p", sk); @@ -638,6 +656,27 @@ static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, err = -EFAULT; break; + case BT_MODE: + if (!enable_ecred) { + err = -ENOPROTOOPT; + break; + } + + if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) { + err = -EINVAL; + break; + } + + mode = l2cap_get_mode(chan); + if (mode < 0) { + err = mode; + break; + } + + if (put_user(mode, (u8 __user *) optval)) + err = -EFAULT; + break; + default: err = -ENOPROTOOPT; break; @@ -780,6 +819,45 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, return err; } +static int l2cap_set_mode(struct l2cap_chan *chan, u8 mode) +{ + switch (mode) { + case BT_MODE_BASIC: + if (bdaddr_type_is_le(chan->src_type)) + return -EINVAL; + mode = L2CAP_MODE_BASIC; + clear_bit(CONF_STATE2_DEVICE, &chan->conf_state); + break; + case BT_MODE_ERTM: + if (!disable_ertm || bdaddr_type_is_le(chan->src_type)) + return -EINVAL; + mode = L2CAP_MODE_ERTM; + break; + case BT_MODE_STREAMING: + if (!disable_ertm || bdaddr_type_is_le(chan->src_type)) + return -EINVAL; + mode = L2CAP_MODE_STREAMING; + break; + case BT_MODE_LE_FLOWCTL: + if (!bdaddr_type_is_le(chan->src_type)) + return -EINVAL; + mode = L2CAP_MODE_LE_FLOWCTL; + break; + case BT_MODE_EXT_FLOWCTL: + /* TODO: Add support for ECRED PDUs to BR/EDR */ + if (!bdaddr_type_is_le(chan->src_type)) + return -EINVAL; + mode = L2CAP_MODE_EXT_FLOWCTL; + break; + default: + return -EINVAL; + } + + chan->mode = mode; + + return 0; +} + static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { @@ -985,6 +1063,39 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; + 
case BT_MODE: + if (!enable_ecred) { + err = -ENOPROTOOPT; + break; + } + + BT_DBG("sk->sk_state %u", sk->sk_state); + + if (sk->sk_state != BT_BOUND) { + err = -EINVAL; + break; + } + + if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) { + err = -EINVAL; + break; + } + + if (get_user(opt, (u8 __user *) optval)) { + err = -EFAULT; + break; + } + + BT_DBG("opt %u", opt); + + err = l2cap_set_mode(chan, opt); + if (err) + break; + + BT_DBG("mode 0x%2.2x", chan->mode); + + break; + default: err = -ENOPROTOOPT; break; -- cgit v1.2.3

From 92516cd97fd4d8ad5b1421a0d51771044f453a5f Mon Sep 17 00:00:00 2001 From: Sonny Sasaka Date: Fri, 27 Mar 2020 17:34:23 -0700 Subject: Bluetooth: Always request for user confirmation for Just Works To improve security, always give the user-space daemon a chance to accept or reject a Just Works pairing (LE). The daemon may decide to auto-accept based on the user's intent. Signed-off-by: Sonny Sasaka Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net')

diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 1476a91ce935..d0b695ee49f6 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -855,6 +855,7 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, struct smp_chan *smp = chan->data; u32 passkey = 0; int ret = 0; + int err; /* Initialize key for JUST WORKS */ memset(smp->tk, 0, sizeof(smp->tk)); @@ -883,9 +884,16 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT) smp->method = JUST_WORKS; - /* If Just Works, Continue with Zero TK */ + /* If Just Works, Continue with Zero TK and ask user-space for + * confirmation */ if (smp->method == JUST_WORKS) { - set_bit(SMP_FLAG_TK_VALID, &smp->flags); + err = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, + hcon->type, + hcon->dst_type, + passkey, 1); + if (err) + return SMP_UNSPECIFIED; + set_bit(SMP_FLAG_WAIT_USER, &smp->flags); return 0; } -- cgit v1.2.3

From 7fedd3bb6b77f9b6eefb0e4dcd8f79d0d00b86d7 Mon Sep 17 00:00:00 2001 From: Abhishek Pandit-Subedi Date: Mon, 23 Mar 2020 12:45:07 -0700 Subject: Bluetooth: Prioritize SCO traffic When scheduling TX packets, send all SCO/eSCO packets first, then check for pending SCO/eSCO packets after every ACL/LE packet and send them if any are pending. This is done to make sure that we can meet SCO deadlines on slow interfaces like UART. If we were to queue up multiple ACL packets without checking for a SCO packet, we might miss the SCO timing. For example: The time it takes to send a maximum size ACL packet (1024 bytes): t = 10/8 * 1024 bytes * 8 bits/byte * 1 packet / baudrate where 10/8 is the UART overhead due to start/stop bits per byte. Substituting t = 3.75ms (the SCO deadline) gives a baudrate of 2730666. At a baudrate of 3000000, if we didn't check for SCO packets within 1024 bytes, we would miss the 3.75ms timing window.
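As a quick editorial check of the arithmetic above (not part of this patch):

	#include <stdio.h>

	int main(void)
	{
		/* A 1024-byte ACL packet over UART takes 10 bits on the wire
		 * per byte (8 data bits plus start/stop overhead, the 10/8
		 * factor above).
		 */
		unsigned long long bits = 1024ULL * 10;             /* 10240 bits */

		/* Baudrate needed to fit those bits into the 3.75 ms deadline */
		unsigned long long baud = bits * 1000000ULL / 3750;

		printf("%llu bits -> %llu baud\n", bits, baud);     /* 2730666 */
		return 0;
	}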
Signed-off-by: Abhishek Pandit-Subedi Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 106 +++++++++++++++++++++++++---------------------- 1 file changed, 57 insertions(+), 49 deletions(-) (limited to 'net')

diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 2e7bc2da8371..5fb9db0b2b7b 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -4240,6 +4240,54 @@ static void __check_timeout(struct hci_dev *hdev, unsigned int cnt) } } +/* Schedule SCO */ +static void hci_sched_sco(struct hci_dev *hdev) +{ + struct hci_conn *conn; + struct sk_buff *skb; + int quote; + + BT_DBG("%s", hdev->name); + + if (!hci_conn_num(hdev, SCO_LINK)) + return; + + while (hdev->sco_cnt && (conn = hci_low_sent(hdev, SCO_LINK, &quote))) { + while (quote-- && (skb = skb_dequeue(&conn->data_q))) { + BT_DBG("skb %p len %d", skb, skb->len); + hci_send_frame(hdev, skb); + + conn->sent++; + if (conn->sent == ~0) + conn->sent = 0; + } + } +} + +static void hci_sched_esco(struct hci_dev *hdev) +{ + struct hci_conn *conn; + struct sk_buff *skb; + int quote; + + BT_DBG("%s", hdev->name); + + if (!hci_conn_num(hdev, ESCO_LINK)) + return; + + while (hdev->sco_cnt && (conn = hci_low_sent(hdev, ESCO_LINK, + &quote))) { + while (quote-- && (skb = skb_dequeue(&conn->data_q))) { + BT_DBG("skb %p len %d", skb, skb->len); + hci_send_frame(hdev, skb); + + conn->sent++; + if (conn->sent == ~0) + conn->sent = 0; + } + } +} + static void hci_sched_acl_pkt(struct hci_dev *hdev) { unsigned int cnt = hdev->acl_cnt; @@ -4271,6 +4319,10 @@ static void hci_sched_acl_pkt(struct hci_dev *hdev) hdev->acl_cnt--; chan->sent++; chan->conn->sent++; + + /* Send pending SCO packets right away */ + hci_sched_sco(hdev); + hci_sched_esco(hdev); } } @@ -4355,54 +4407,6 @@ static void hci_sched_acl(struct hci_dev *hdev) } } -/* Schedule SCO */ -static void hci_sched_sco(struct hci_dev *hdev) -{ - struct hci_conn *conn; - struct sk_buff *skb; - int quote; - - BT_DBG("%s", hdev->name); - - if (!hci_conn_num(hdev, SCO_LINK)) - return; - - while (hdev->sco_cnt && (conn = hci_low_sent(hdev, SCO_LINK, &quote))) { - while (quote-- && (skb = skb_dequeue(&conn->data_q))) { - BT_DBG("skb %p len %d", skb, skb->len); - hci_send_frame(hdev, skb); - - conn->sent++; - if (conn->sent == ~0) - conn->sent = 0; - } - } -} - -static void hci_sched_esco(struct hci_dev *hdev) -{ - struct hci_conn *conn; - struct sk_buff *skb; - int quote; - - BT_DBG("%s", hdev->name); - - if (!hci_conn_num(hdev, ESCO_LINK)) - return; - - while (hdev->sco_cnt && (conn = hci_low_sent(hdev, ESCO_LINK, - &quote))) { - while (quote-- && (skb = skb_dequeue(&conn->data_q))) { - BT_DBG("skb %p len %d", skb, skb->len); - hci_send_frame(hdev, skb); - - conn->sent++; - if (conn->sent == ~0) - conn->sent = 0; - } - } -} - static void hci_sched_le(struct hci_dev *hdev) { struct hci_chan *chan; @@ -4437,6 +4441,10 @@ static void hci_sched_le(struct hci_dev *hdev) cnt--; chan->sent++; chan->conn->sent++; + + /* Send pending SCO packets right away */ + hci_sched_sco(hdev); + hci_sched_esco(hdev); } } @@ -4459,9 +4467,9 @@ static void hci_tx_work(struct work_struct *work) if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { /* Schedule queues and send stuff to HCI driver */ - hci_sched_acl(hdev); hci_sched_sco(hdev); hci_sched_esco(hdev); + hci_sched_acl(hdev); hci_sched_le(hdev); } -- cgit v1.2.3

From 1e5479be46a70389e1059818a2e9358858eaa5fc Mon Sep 17 00:00:00 2001 From: Alain Michaud Date: Fri, 3 Apr 2020 13:49:05 +0000 Subject: Bluetooth: fixing minor typo in comment This
changes a simple typo in hci_event.c Signed-off-by: Alain Michaud Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net')

diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 0a591be8b0ae..ddf77304aa8e 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5269,7 +5269,7 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, case HCI_AUTO_CONN_ALWAYS: /* Devices advertising with ADV_IND or ADV_DIRECT_IND * are triggering a connection attempt. This means - * that incoming connectioms from slave device are + * that incoming connections from slave device are * accepted and also outgoing connections to slave * devices are established when found. */ -- cgit v1.2.3

From 1f8330ea1692c9c490b1e566e31d96d8cef99dd8 Mon Sep 17 00:00:00 2001 From: Sathish Narsimman Date: Fri, 3 Apr 2020 21:43:58 +0200 Subject: Bluetooth: add support to notify using SCO air mode Notifying with HCI_NOTIFY_CONN_ADD for an SCO connection is generic, even in the case of mSBC audio. To differentiate the SCO air mode, introduce HCI_NOTIFY_ENABLE_SCO_CVSD and HCI_NOTIFY_ENABLE_SCO_TRANSP. Signed-off-by: Sathish Narsimman Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci.h | 3 +++ net/bluetooth/hci_conn.c | 25 +++++++++++++++++++++---- net/bluetooth/hci_event.c | 23 ++++++++++++++++++++++- 3 files changed, 46 insertions(+), 5 deletions(-) (limited to 'net')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 5f60e135aeb6..9ff2f7a9e131 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -53,6 +53,9 @@ #define HCI_NOTIFY_CONN_ADD 1 #define HCI_NOTIFY_CONN_DEL 2 #define HCI_NOTIFY_VOICE_SETTING 3 +#define HCI_NOTIFY_ENABLE_SCO_CVSD 4 +#define HCI_NOTIFY_ENABLE_SCO_TRANSP 5 +#define HCI_NOTIFY_DISABLE_SCO 6 /* HCI bus types */ #define HCI_VIRTUAL 0 diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index e245bc155cc2..07c34c55fc50 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -122,8 +122,18 @@ static void hci_conn_cleanup(struct hci_conn *conn) hci_conn_hash_del(hdev, conn); - if (hdev->notify) - hdev->notify(hdev, HCI_NOTIFY_CONN_DEL); + if (conn->type == SCO_LINK || conn->type == ESCO_LINK) { + switch (conn->setting & SCO_AIRMODE_MASK) { + case SCO_AIRMODE_CVSD: + case SCO_AIRMODE_TRANSP: + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_DISABLE_SCO); + break; + } + } else { + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_CONN_DEL); + } hci_conn_del_sysfs(conn); @@ -577,8 +587,15 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, hci_dev_hold(hdev); hci_conn_hash_add(hdev, conn); - if (hdev->notify) - hdev->notify(hdev, HCI_NOTIFY_CONN_ADD); + + /* The SCO and eSCO connections will only be notified when their + * setup has been completed. This is different to ACL links which + * can be notified right away.
+ */ + if (conn->type != SCO_LINK && conn->type != ESCO_LINK) { + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_CONN_ADD); + } hci_conn_init_sysfs(conn); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index ddf77304aa8e..af396cb69602 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2607,8 +2607,16 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) if (ev->status) { hci_connect_cfm(conn, ev->status); hci_conn_del(conn); - } else if (ev->link_type != ACL_LINK) + } else if (ev->link_type == SCO_LINK) { + switch (conn->setting & SCO_AIRMODE_MASK) { + case SCO_AIRMODE_CVSD: + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_ENABLE_SCO_CVSD); + break; + } + hci_connect_cfm(conn, ev->status); + } unlock: hci_dev_unlock(hdev); @@ -4307,6 +4315,19 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, break; } + bt_dev_dbg(hdev, "SCO connected with air mode: %02x", ev->air_mode); + + switch (conn->setting & SCO_AIRMODE_MASK) { + case SCO_AIRMODE_CVSD: + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_ENABLE_SCO_CVSD); + break; + case SCO_AIRMODE_TRANSP: + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_ENABLE_SCO_TRANSP); + break; + } + hci_connect_cfm(conn, ev->status); if (ev->status) hci_conn_del(conn); -- cgit v1.2.3

From 3d2336042ae3555d4b77995402291c5795882d20 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Fri, 3 Apr 2020 21:44:00 +0200 Subject: Bluetooth: Move debugfs configuration above the selftests This is just a cosmetic cleanup to move the selftests configuration option to the bottom of the list of options. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/Kconfig | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net')

diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index 165148c7c4ce..77703216a2e3 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -93,6 +93,14 @@ config BT_LEDS This option selects a few LED triggers for different Bluetooth events. +config BT_DEBUGFS + bool "Export Bluetooth internals in debugfs" + depends on BT && DEBUG_FS + default y + help + Provide extensive information about internal Bluetooth states + in debugfs. + config BT_SELFTEST bool "Bluetooth self testing support" depends on BT && DEBUG_KERNEL @@ -120,12 +128,4 @@ config BT_SELFTEST_SMP Run test cases for SMP cryptographic functionality, including both legacy SMP as well as the Secure Connections features. -config BT_DEBUGFS - bool "Export Bluetooth internals in debugfs" - depends on BT && DEBUG_FS - default y - help - Provide extensive information about internal Bluetooth states - in debugfs. - source "drivers/bluetooth/Kconfig" -- cgit v1.2.3

From 145373cb1b1fcdba2059e945d0aa2613af2e84d1 Mon Sep 17 00:00:00 2001 From: Miao-chen Chou Date: Fri, 3 Apr 2020 21:44:01 +0200 Subject: Bluetooth: Add framework for Microsoft vendor extension Microsoft defined a set of HCI vendor extensions. Check the following link for details: https://docs.microsoft.com/en-us/windows-hardware/drivers/bluetooth/microsoft-defined-bluetooth-hci-commands-and-events This provides the basic framework to enable the extension and read its supported features. Drivers still have to declare support for this extension before it can be utilized by the host stack.
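As a sketch of how a driver might declare that support using the hci_set_msft_opcode() helper introduced below (an editorial example, not part of this patch; the opcode value is hypothetical, since each vendor assigns its own):

	/* Somewhere in a driver's setup path; 0xfc1e is a made-up
	 * vendor-specific opcode used purely for illustration.
	 */
	static int example_driver_setup(struct hci_dev *hdev)
	{
		hci_set_msft_opcode(hdev, 0xfc1e);

		return 0;
	}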
Signed-off-by: Miao-chen Chou Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_core.h | 13 ++++ net/bluetooth/Kconfig | 7 ++ net/bluetooth/Makefile | 1 + net/bluetooth/hci_core.c | 5 ++ net/bluetooth/hci_event.c | 5 ++ net/bluetooth/msft.c | 141 +++++++++++++++++++++++++++++++++++++++ net/bluetooth/msft.h | 18 +++++ 7 files changed, 190 insertions(+) create mode 100644 net/bluetooth/msft.c create mode 100644 net/bluetooth/msft.h (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index d4e28773d378..3cb0f82d0c83 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -484,6 +484,11 @@ struct hci_dev { struct led_trigger *power_led; #endif +#if IS_ENABLED(CONFIG_BT_MSFTEXT) + __u16 msft_opcode; + void *msft_data; +#endif + int (*open)(struct hci_dev *hdev); int (*close)(struct hci_dev *hdev); int (*flush)(struct hci_dev *hdev); @@ -1116,6 +1121,14 @@ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb); int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb); __printf(2, 3) void hci_set_hw_info(struct hci_dev *hdev, const char *fmt, ...); __printf(2, 3) void hci_set_fw_info(struct hci_dev *hdev, const char *fmt, ...); + +static inline void hci_set_msft_opcode(struct hci_dev *hdev, __u16 opcode) +{ +#if IS_ENABLED(CONFIG_BT_MSFTEXT) + hdev->msft_opcode = opcode; +#endif +} + int hci_dev_open(__u16 dev); int hci_dev_close(__u16 dev); int hci_dev_do_close(struct hci_dev *hdev); diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index 77703216a2e3..9e25c6570170 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -93,6 +93,13 @@ config BT_LEDS This option selects a few LED triggers for different Bluetooth events. +config BT_MSFTEXT + bool "Enable Microsoft extensions" + depends on BT + help + This options enables support for the Microsoft defined HCI + vendor extensions. 
+ config BT_DEBUGFS bool "Export Bluetooth internals in debugfs" depends on BT && DEBUG_FS diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile index fda41c0b4781..41dd541a44a5 100644 --- a/net/bluetooth/Makefile +++ b/net/bluetooth/Makefile @@ -19,5 +19,6 @@ bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \ bluetooth-$(CONFIG_BT_BREDR) += sco.o bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o bluetooth-$(CONFIG_BT_LEDS) += leds.o +bluetooth-$(CONFIG_BT_MSFTEXT) += msft.o bluetooth-$(CONFIG_BT_DEBUGFS) += hci_debugfs.o bluetooth-$(CONFIG_BT_SELFTEST) += selftest.o diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 5fb9db0b2b7b..ef0ee3a3d9ed 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -44,6 +44,7 @@ #include "hci_debugfs.h" #include "smp.h" #include "leds.h" +#include "msft.h" static void hci_rx_work(struct work_struct *work); static void hci_cmd_work(struct work_struct *work); @@ -1563,6 +1564,8 @@ setup_failed: hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) && hdev->set_diag) ret = hdev->set_diag(hdev, true); + msft_do_open(hdev); + clear_bit(HCI_INIT, &hdev->flags); if (!ret) { @@ -1758,6 +1761,8 @@ int hci_dev_do_close(struct hci_dev *hdev) hci_sock_dev_event(hdev, HCI_DEV_DOWN); + msft_do_close(hdev); + if (hdev->flush) hdev->flush(hdev); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index af396cb69602..2803beaa1c44 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -35,6 +35,7 @@ #include "a2mp.h" #include "amp.h" #include "smp.h" +#include "msft.h" #define ZERO_KEY "\x00\x00\x00\x00\x00\x00\x00\x00" \ "\x00\x00\x00\x00\x00\x00\x00\x00" @@ -6166,6 +6167,10 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) hci_num_comp_blocks_evt(hdev, skb); break; + case HCI_EV_VENDOR: + msft_vendor_evt(hdev, skb); + break; + default: BT_DBG("%s event 0x%2.2x", hdev->name, event); break; diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c new file mode 100644 index 000000000000..d6c4e6b5ae77 --- /dev/null +++ b/net/bluetooth/msft.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Google Corporation + */ + +#include +#include + +#include "msft.h" + +#define MSFT_OP_READ_SUPPORTED_FEATURES 0x00 +struct msft_cp_read_supported_features { + __u8 sub_opcode; +} __packed; +struct msft_rp_read_supported_features { + __u8 status; + __u8 sub_opcode; + __le64 features; + __u8 evt_prefix_len; + __u8 evt_prefix[0]; +} __packed; + +struct msft_data { + __u64 features; + __u8 evt_prefix_len; + __u8 *evt_prefix; +}; + +static bool read_supported_features(struct hci_dev *hdev, + struct msft_data *msft) +{ + struct msft_cp_read_supported_features cp; + struct msft_rp_read_supported_features *rp; + struct sk_buff *skb; + + cp.sub_opcode = MSFT_OP_READ_SUPPORTED_FEATURES; + + skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, + HCI_CMD_TIMEOUT); + if (IS_ERR(skb)) { + bt_dev_err(hdev, "Failed to read MSFT supported features (%ld)", + PTR_ERR(skb)); + return false; + } + + if (skb->len < sizeof(*rp)) { + bt_dev_err(hdev, "MSFT supported features length mismatch"); + goto failed; + } + + rp = (struct msft_rp_read_supported_features *)skb->data; + + if (rp->sub_opcode != MSFT_OP_READ_SUPPORTED_FEATURES) + goto failed; + + if (rp->evt_prefix_len > 0) { + msft->evt_prefix = kmemdup(rp->evt_prefix, rp->evt_prefix_len, + GFP_KERNEL); + if (!msft->evt_prefix) + goto failed; + } + + msft->evt_prefix_len = rp->evt_prefix_len; + msft->features = 
__le64_to_cpu(rp->features); + + kfree_skb(skb); + return true; + +failed: + kfree_skb(skb); + return false; +} + +void msft_do_open(struct hci_dev *hdev) +{ + struct msft_data *msft; + + if (hdev->msft_opcode == HCI_OP_NOP) + return; + + bt_dev_dbg(hdev, "Initialize MSFT extension"); + + msft = kzalloc(sizeof(*msft), GFP_KERNEL); + if (!msft) + return; + + if (!read_supported_features(hdev, msft)) { + kfree(msft); + return; + } + + hdev->msft_data = msft; +} + +void msft_do_close(struct hci_dev *hdev) +{ + struct msft_data *msft = hdev->msft_data; + + if (!msft) + return; + + bt_dev_dbg(hdev, "Cleanup of MSFT extension"); + + hdev->msft_data = NULL; + + kfree(msft->evt_prefix); + kfree(msft); +} + +void msft_vendor_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct msft_data *msft = hdev->msft_data; + u8 event; + + if (!msft) + return; + + /* When the extension has defined an event prefix, check that it + * matches, and otherwise just return. + */ + if (msft->evt_prefix_len > 0) { + if (skb->len < msft->evt_prefix_len) + return; + + if (memcmp(skb->data, msft->evt_prefix, msft->evt_prefix_len)) + return; + + skb_pull(skb, msft->evt_prefix_len); + } + + /* Every event starts at least with an event code and the rest of + * the data is variable and depends on the event code. + */ + if (skb->len < 1) + return; + + event = *skb->data; + skb_pull(skb, 1); + + bt_dev_dbg(hdev, "MSFT vendor event %u", event); +} diff --git a/net/bluetooth/msft.h b/net/bluetooth/msft.h new file mode 100644 index 000000000000..5aa9130e1f8a --- /dev/null +++ b/net/bluetooth/msft.h @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Google Corporation + */ + +#if IS_ENABLED(CONFIG_BT_MSFTEXT) + +void msft_do_open(struct hci_dev *hdev); +void msft_do_close(struct hci_dev *hdev); +void msft_vendor_evt(struct hci_dev *hdev, struct sk_buff *skb); + +#else + +static inline void msft_do_open(struct hci_dev *hdev) {} +static inline void msft_do_close(struct hci_dev *hdev) {} +static inline void msft_vendor_evt(struct hci_dev *hdev, struct sk_buff *skb) {} + +#endif -- cgit v1.2.3

From a479036041d6a1bcf98f72b16a425e8d45e20ae9 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Fri, 3 Apr 2020 21:44:04 +0200 Subject: Bluetooth: Add support for Read Local Simple Pairing Options With the Read Local Simple Pairing Options command it is possible to retrieve the maximum encryption key size supported by the controller and also whether the controller correctly verifies the ECDH public key during pairing.
Signed-off-by: Marcel Holtmann Reviewed-by: Alain Michaud Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci.h | 7 +++++++ include/net/bluetooth/hci_core.h | 2 ++ net/bluetooth/hci_core.c | 4 ++++ net/bluetooth/hci_event.c | 21 +++++++++++++++++++++ 4 files changed, 34 insertions(+) (limited to 'net')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 9ff2f7a9e131..086a9e9d5d03 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1275,6 +1275,13 @@ struct hci_rp_read_data_block_size { #define HCI_OP_READ_LOCAL_CODECS 0x100b +#define HCI_OP_READ_LOCAL_PAIRING_OPTS 0x100c +struct hci_rp_read_local_pairing_opts { + __u8 status; + __u8 pairing_opts; + __u8 max_key_size; +} __packed; + #define HCI_OP_READ_PAGE_SCAN_ACTIVITY 0x0c1b struct hci_rp_read_page_scan_activity { __u8 status; diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 3cb0f82d0c83..2f3275f1d1c4 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -312,6 +312,8 @@ struct hci_dev { __u16 conn_info_max_age; __u16 auth_payload_timeout; __u8 min_enc_key_size; + __u8 max_enc_key_size; + __u8 pairing_opts; __u8 ssp_debug_mode; __u8 hw_error_code; __u32 clock; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index ef0ee3a3d9ed..589c4085499c 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -827,6 +827,10 @@ static int hci_init4_req(struct hci_request *req, unsigned long opt) if (hdev->commands[29] & 0x20) hci_req_add(req, HCI_OP_READ_LOCAL_CODECS, 0, NULL); + /* Read local pairing options if the HCI command is supported */ + if (hdev->commands[41] & 0x08) + hci_req_add(req, HCI_OP_READ_LOCAL_PAIRING_OPTS, 0, NULL); + /* Get MWS transport configuration if the HCI command is supported */ if (hdev->commands[30] & 0x08) hci_req_add(req, HCI_OP_GET_MWS_TRANSPORT_CONFIG, 0, NULL); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 2803beaa1c44..51e6461f0b71 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -747,6 +747,23 @@ static void hci_cc_read_bd_addr(struct hci_dev *hdev, struct sk_buff *skb) bacpy(&hdev->setup_addr, &rp->bdaddr); } +static void hci_cc_read_local_pairing_opts(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_rp_read_local_pairing_opts *rp = (void *) skb->data; + + BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); + + if (rp->status) + return; + + if (hci_dev_test_flag(hdev, HCI_SETUP) || + hci_dev_test_flag(hdev, HCI_CONFIG)) { + hdev->pairing_opts = rp->pairing_opts; + hdev->max_enc_key_size = rp->max_key_size; + } +} + static void hci_cc_read_page_scan_activity(struct hci_dev *hdev, struct sk_buff *skb) { @@ -3343,6 +3360,10 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_read_bd_addr(hdev, skb); break; + case HCI_OP_READ_LOCAL_PAIRING_OPTS: + hci_cc_read_local_pairing_opts(hdev, skb); + break; + case HCI_OP_READ_PAGE_SCAN_ACTIVITY: hci_cc_read_page_scan_activity(hdev, skb); break; -- cgit v1.2.3

From bc292258c580a82c9baef0a64f66971e010a40a9 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Fri, 3 Apr 2020 21:44:05 +0200 Subject: Bluetooth: Add support for reading security information To allow userspace to make correct security policy decisions, the kernel needs to export a few details of the supported security features and encryption key size information. This command exports this information and also allows future extensions if needed.
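For illustration, an editorial sketch (not part of this patch) of how user space might walk the EIR-style TLV list returned in the sec field, based on the tags generated below (0x01 flags, 0x02 BR/EDR max key size, 0x03 SMP max key size):

	#include <stdint.h>
	#include <stdio.h>

	/* Each entry is <length><type><data...>, where length covers the type
	 * byte plus the data, as produced by eir_append_data()/eir_append_le16().
	 */
	static void parse_security_info(const uint8_t *sec, uint16_t sec_len)
	{
		uint16_t i = 0;

		while (i + 1 < sec_len) {
			uint8_t len = sec[i];
			uint8_t type = sec[i + 1];
			const uint8_t *data = &sec[i + 2];

			if (!len || i + 1 + len > sec_len)
				break;

			if (type == 0x01 && len == 2)
				printf("flags: 0x%02x\n", data[0]);
			else if ((type == 0x02 || type == 0x03) && len == 3)
				printf("key size (tag %u): %u\n", type,
				       (unsigned int)(data[0] | (data[1] << 8)));

			i += 1 + len;
		}
	}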
Signed-off-by: Marcel Holtmann Reviewed-by: Alain Michaud Signed-off-by: Johan Hedberg --- include/net/bluetooth/mgmt.h | 7 ++++++ net/bluetooth/mgmt.c | 53 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) (limited to 'net') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index f41cd87550dc..65dd6fd1fff3 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -674,6 +674,13 @@ struct mgmt_cp_set_blocked_keys { #define MGMT_OP_SET_WIDEBAND_SPEECH 0x0047 +#define MGMT_OP_READ_SECURITY_INFO 0x0048 +#define MGMT_READ_SECURITY_INFO_SIZE 0 +struct mgmt_rp_read_security_info { + __le16 sec_len; + __u8 sec[0]; +} __packed; + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 6552003a170e..7b9eac339c87 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -108,6 +108,7 @@ static const u16 mgmt_commands[] = { MGMT_OP_SET_APPEARANCE, MGMT_OP_SET_BLOCKED_KEYS, MGMT_OP_SET_WIDEBAND_SPEECH, + MGMT_OP_READ_SECURITY_INFO, }; static const u16 mgmt_events[] = { @@ -155,6 +156,7 @@ static const u16 mgmt_untrusted_commands[] = { MGMT_OP_READ_CONFIG_INFO, MGMT_OP_READ_EXT_INDEX_LIST, MGMT_OP_READ_EXT_INFO, + MGMT_OP_READ_SECURITY_INFO, }; static const u16 mgmt_untrusted_events[] = { @@ -3659,6 +3661,55 @@ unlock: return err; } +static int read_security_info(struct sock *sk, struct hci_dev *hdev, + void *data, u16 data_len) +{ + char buf[16]; + struct mgmt_rp_read_security_info *rp = (void *)buf; + u16 sec_len = 0; + u8 flags = 0; + + bt_dev_dbg(hdev, "sock %p", sk); + + memset(&buf, 0, sizeof(buf)); + + hci_dev_lock(hdev); + + /* When the Read Simple Pairing Options command is supported, then + * the remote public key validation is supported. + */ + if (hdev->commands[41] & 0x08) + flags |= 0x01; /* Remote public key validation (BR/EDR) */ + + flags |= 0x02; /* Remote public key validation (LE) */ + + /* When the Read Encryption Key Size command is supported, then the + * encryption key size is enforced. + */ + if (hdev->commands[20] & 0x10) + flags |= 0x04; /* Encryption key size enforcement (BR/EDR) */ + + flags |= 0x08; /* Encryption key size enforcement (LE) */ + + sec_len = eir_append_data(rp->sec, sec_len, 0x01, &flags, 1); + + /* When the Read Simple Pairing Options command is supported, then + * also max encryption key size information is provided. + */ + if (hdev->commands[41] & 0x08) + sec_len = eir_append_le16(rp->sec, sec_len, 0x02, + hdev->max_enc_key_size); + + sec_len = eir_append_le16(rp->sec, sec_len, 0x03, SMP_MAX_ENC_KEY_SIZE); + + rp->sec_len = cpu_to_le16(sec_len); + + hci_dev_unlock(hdev); + + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_SECURITY_INFO, 0, + rp, sizeof(*rp) + sec_len); +} + static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) { @@ -7099,6 +7150,8 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { set_blocked_keys, MGMT_OP_SET_BLOCKED_KEYS_SIZE, HCI_MGMT_VAR_LEN }, { set_wideband_speech, MGMT_SETTING_SIZE }, + { read_security_info, MGMT_READ_SECURITY_INFO_SIZE, + HCI_MGMT_UNTRUSTED }, }; void mgmt_index_added(struct hci_dev *hdev) -- cgit v1.2.3 From 3679fe7d43c65e07f00afb216987f33e152ceb6f Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Fri, 3 Apr 2020 21:44:06 +0200 Subject: Bluetooth: Increment management interface revision Increment the mgmt revision due to the recently added new commands. 
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/mgmt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 7b9eac339c87..f8c0a4fc8090 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -38,7 +38,7 @@ #include "mgmt_util.h" #define MGMT_VERSION 1 -#define MGMT_REVISION 16 +#define MGMT_REVISION 17 static const u16 mgmt_commands[] = { MGMT_OP_READ_INDEX_LIST, -- cgit v1.2.3 From c2aa30db744d9cbdde127d4ed8aeea18273834c6 Mon Sep 17 00:00:00 2001 From: Archie Pusaka Date: Tue, 7 Apr 2020 12:26:27 +0800 Subject: Bluetooth: debugfs option to unset MITM flag The BT qualification test SM/MAS/PKE/BV-01-C needs us to turn off the MITM flag when pairing, and at the same time also set the io capability to something other than no input no output. Currently the MITM flag is only unset when the io capability is set to no input no output, therefore the test cannot be executed. This patch introduces a debugfs option to force MITM flag to be turned off. Signed-off-by: Archie Pusaka Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 1 + net/bluetooth/hci_debugfs.c | 46 +++++++++++++++++++++++++++++++++++++++++++++ net/bluetooth/smp.c | 15 ++++++++++----- 3 files changed, 57 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 79de2a659dd6..f4e8e2a0b7c1 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -298,6 +298,7 @@ enum { HCI_FORCE_STATIC_ADDR, HCI_LL_RPA_RESOLUTION, HCI_CMD_PENDING, + HCI_FORCE_NO_MITM, __HCI_NUM_FLAGS, }; diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 6b1314c738b8..5e8af2658e44 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -1075,6 +1075,50 @@ DEFINE_SIMPLE_ATTRIBUTE(auth_payload_timeout_fops, auth_payload_timeout_get, auth_payload_timeout_set, "%llu\n"); +static ssize_t force_no_mitm_read(struct file *file, + char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[3]; + + buf[0] = hci_dev_test_flag(hdev, HCI_FORCE_NO_MITM) ? 
'Y' : 'N'; + buf[1] = '\n'; + buf[2] = '\0'; + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static ssize_t force_no_mitm_write(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[32]; + size_t buf_size = min(count, (sizeof(buf) - 1)); + bool enable; + + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size] = '\0'; + if (strtobool(buf, &enable)) + return -EINVAL; + + if (enable == hci_dev_test_flag(hdev, HCI_FORCE_NO_MITM)) + return -EALREADY; + + hci_dev_change_flag(hdev, HCI_FORCE_NO_MITM); + + return count; +} + +static const struct file_operations force_no_mitm_fops = { + .open = simple_open, + .read = force_no_mitm_read, + .write = force_no_mitm_write, + .llseek = default_llseek, +}; + DEFINE_QUIRK_ATTRIBUTE(quirk_strict_duplicate_filter, HCI_QUIRK_STRICT_DUPLICATE_FILTER); DEFINE_QUIRK_ATTRIBUTE(quirk_simultaneous_discovery, @@ -1134,6 +1178,8 @@ void hci_debugfs_create_le(struct hci_dev *hdev) &max_key_size_fops); debugfs_create_file("auth_payload_timeout", 0644, hdev->debugfs, hdev, &auth_payload_timeout_fops); + debugfs_create_file("force_no_mitm", 0644, hdev->debugfs, hdev, + &force_no_mitm_fops); debugfs_create_file("quirk_strict_duplicate_filter", 0644, hdev->debugfs, hdev, diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index d0b695ee49f6..a85e3e49cd0d 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -2393,12 +2393,17 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) authreq |= SMP_AUTH_CT2; } - /* Require MITM if IO Capability allows or the security level - * requires it. + /* Don't attempt to set MITM if setting is overridden by debugfs + * Needed to pass certification test SM/MAS/PKE/BV-01-C */ - if (hcon->io_capability != HCI_IO_NO_INPUT_OUTPUT || - hcon->pending_sec_level > BT_SECURITY_MEDIUM) - authreq |= SMP_AUTH_MITM; + if (!hci_dev_test_flag(hcon->hdev, HCI_FORCE_NO_MITM)) { + /* Require MITM if IO Capability allows or the security level + * requires it. + */ + if (hcon->io_capability != HCI_IO_NO_INPUT_OUTPUT || + hcon->pending_sec_level > BT_SECURITY_MEDIUM) + authreq |= SMP_AUTH_MITM; + } if (hcon->role == HCI_ROLE_MASTER) { struct smp_cmd_pairing cp; -- cgit v1.2.3 From d1d900f822b6b2874de9c1ef8094fc8df56a2f9f Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 6 Apr 2020 11:54:38 -0700 Subject: Bluetooth: Simplify / fix return values from tk_request Some static checker run by 0day reports a variableScope warning. net/bluetooth/smp.c:870:6: warning: The scope of the variable 'err' can be reduced. [variableScope] There is no need for two separate variables holding return values. Stick with the existing variable. While at it, don't pre-initialize 'ret' because it is set in each code path. tk_request() is supposed to return a negative error code on errors, not a bluetooth return code. The calling code converts the return value to SMP_UNSPECIFIED if needed. 
Fixes: 92516cd97fd4 ("Bluetooth: Always request for user confirmation for Just Works") Cc: Sonny Sasaka Signed-off-by: Guenter Roeck Reviewed-by: Sonny Sasaka Signed-off-by: Sonny Sasaka Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index a85e3e49cd0d..daf198fb2b31 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -854,8 +854,7 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; u32 passkey = 0; - int ret = 0; - int err; + int ret; /* Initialize key for JUST WORKS */ memset(smp->tk, 0, sizeof(smp->tk)); @@ -887,12 +886,12 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, /* If Just Works, Continue with Zero TK and ask user-space for * confirmation */ if (smp->method == JUST_WORKS) { - err = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, + ret = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type, passkey, 1); - if (err) - return SMP_UNSPECIFIED; + if (ret) + return ret; set_bit(SMP_FLAG_WAIT_USER, &smp->flags); return 0; } -- cgit v1.2.3 From ffee202a78c2980688bc5d2f7d56480e69a5e0c9 Mon Sep 17 00:00:00 2001 From: Sonny Sasaka Date: Mon, 6 Apr 2020 11:04:02 -0700 Subject: Bluetooth: Always request for user confirmation for Just Works (LE SC) To improve security, always give the user-space daemon a chance to accept or reject a Just Works pairing (LE). The daemon may decide to auto-accept based on the user's intent. This patch is similar to the previous patch but applies for LE Secure Connections (SC). Signed-off-by: Sonny Sasaka Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index daf198fb2b31..df22cbf94693 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -2201,7 +2201,7 @@ mackey_and_ltk: if (err) return SMP_UNSPECIFIED; - if (smp->method == JUST_WORKS || smp->method == REQ_OOB) { + if (smp->method == REQ_OOB) { if (hcon->out) { sc_dhkey_check(smp); SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); @@ -2216,6 +2216,9 @@ mackey_and_ltk: confirm_hint = 0; confirm: + if (smp->method == JUST_WORKS) + confirm_hint = 1; + err = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type, passkey, confirm_hint); if (err) -- cgit v1.2.3 From 943d5d92c5e87aa8293aae6de2b3ee977aa7d3cf Mon Sep 17 00:00:00 2001 From: Daniels Umanovskis Date: Thu, 9 Apr 2020 13:18:29 +0200 Subject: Bluetooth: log advertisement packet length if it gets corrected The error could indicate a problem with the Bluetooth device. It is easier to investigate if the packet's actual length gets logged, not just the fact that a discrepancy occurred. 
Signed-off-by: Daniels Umanovskis Reviewed-by: Alain Michaud Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net')

diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 51e6461f0b71..966fc543c01d 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5396,7 +5396,8 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, /* Adjust for actual length */ if (len != real_len) { - bt_dev_err_ratelimited(hdev, "advertising data len corrected"); + bt_dev_err_ratelimited(hdev, "advertising data len corrected %u -> %u", + len, real_len); len = real_len; } -- cgit v1.2.3

From 849c9c35e80d73c215c65b6023658b371bdeb5ed Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 9 Apr 2020 08:05:48 +0200 Subject: Bluetooth: Use extra variable to make code more readable When starting active scanning for discovery, the whitelist does not need to be used, so the filter_policy is 0x00. To make the code more readable, use a named variable instead of just passing 0 as the parameter. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_request.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net')

diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 649e1e5ed446..9ea40106ef17 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -2723,6 +2723,8 @@ static int active_scan(struct hci_request *req, unsigned long opt) uint16_t interval = opt; struct hci_dev *hdev = req->hdev; u8 own_addr_type; + /* White list is not used for discovery */ + u8 filter_policy = 0x00; int err; BT_DBG("%s", hdev->name); @@ -2744,7 +2746,7 @@ static int active_scan(struct hci_request *req, unsigned long opt) own_addr_type = ADDR_LE_DEV_PUBLIC; hci_req_start_scan(req, LE_SCAN_ACTIVE, interval, DISCOV_LE_SCAN_WIN, - own_addr_type, 0); + own_addr_type, filter_policy); return 0; } -- cgit v1.2.3

From ff3b8df2bd758d97aa3dd7c021864be05fec9bd5 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 9 Apr 2020 08:05:49 +0200 Subject: Bluetooth: Enable LE Enhanced Connection Complete event. In case LL Privacy is supported by the controller, it is also a good idea to use the LE Enhanced Connection Complete event for getting all information about the new connection and its addresses. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci.h | 1 + net/bluetooth/hci_core.c | 8 ++++++++ 2 files changed, 9 insertions(+) (limited to 'net')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index ff42d05b3e72..1da8cec8e210 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -460,6 +460,7 @@ enum { #define HCI_LE_SLAVE_FEATURES 0x08 #define HCI_LE_PING 0x10 #define HCI_LE_DATA_LEN_EXT 0x20 +#define HCI_LE_LL_PRIVACY 0x40 #define HCI_LE_EXT_SCAN_POLICY 0x80 #define HCI_LE_PHY_2M 0x01 #define HCI_LE_PHY_CODED 0x08 diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 589c4085499c..0d726d59a492 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -638,6 +638,14 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) events[0] |= 0x40; /* LE Data Length Change */ + /* If the controller supports LL Privacy feature, enable + * the corresponding event.
+ */ + if (hdev->le_features[0] & HCI_LE_LL_PRIVACY) + events[1] |= 0x02; /* LE Enhanced Connection + * Complete + */ + /* If the controller supports Extended Scanner Filter * Policies, enable the correspondig event. */ -- cgit v1.2.3

From 7edc9079540b65026f3d3386b3642d1820d5fed5 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Wed, 15 Apr 2020 17:35:16 +0200 Subject: Bluetooth: Enhanced Connection Complete event belongs to LL Privacy The Enhanced Connection Complete event is used in conjunction with LL Privacy and not Extended Advertising. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_core.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'net')

diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 0d726d59a492..51d399273276 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -719,14 +719,6 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) * Report */ - /* If the controller supports the LE Extended Create Connection - * command, enable the corresponding event. - */ - if (use_ext_conn(hdev)) - events[1] |= 0x02; /* LE Enhanced Connection - * Complete - */ - /* If the controller supports the LE Extended Advertising * command, enable the corresponding event. */ -- cgit v1.2.3

From eec517cdb4810b3843eb7707971de3164088bff1 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 20 Apr 2020 00:11:50 +0200 Subject: net: Add IF_OPER_TESTING RFC 2863 defines the operational state testing. Add support for this state, both as an IF_LINK_MODE_ and a __LINK_STATE_. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/netdevice.h | 41 +++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/if.h | 1 + net/core/dev.c | 5 +++++ net/core/link_watch.c | 12 ++++++++++-- net/core/rtnetlink.c | 9 ++++++++- 5 files changed, 65 insertions(+), 3 deletions(-) (limited to 'net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 130a668049ab..0750b54b3765 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -288,6 +288,7 @@ enum netdev_state_t { __LINK_STATE_NOCARRIER, __LINK_STATE_LINKWATCH_PENDING, __LINK_STATE_DORMANT, + __LINK_STATE_TESTING, }; @@ -3907,6 +3908,46 @@ static inline bool netif_dormant(const struct net_device *dev) } +/** + * netif_testing_on - mark device as under test. + * @dev: network device + * + * Mark device as under test (as per RFC2863). + * + * The testing state indicates that some test(s) must be performed on + * the interface. After completion, of the test, the interface state + * will change to up, dormant, or down, as appropriate. + */ +static inline void netif_testing_on(struct net_device *dev) +{ + if (!test_and_set_bit(__LINK_STATE_TESTING, &dev->state)) + linkwatch_fire_event(dev); +} + +/** + * netif_testing_off - set device as not under test. + * @dev: network device + * + * Device is not in testing state.
+ */ +static inline void netif_testing_off(struct net_device *dev) +{ + if (test_and_clear_bit(__LINK_STATE_TESTING, &dev->state)) + linkwatch_fire_event(dev); +} + +/** + * netif_testing - test if device is under test + * @dev: network device + * + * Check if device is under test + */ +static inline bool netif_testing(const struct net_device *dev) +{ + return test_bit(__LINK_STATE_TESTING, &dev->state); +} + + /** * netif_oper_up - test if device is operational + * @dev: network device diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h index be714cd8c826..797ba2c1562a 100644 --- a/include/uapi/linux/if.h +++ b/include/uapi/linux/if.h @@ -178,6 +178,7 @@ enum { enum { IF_LINK_MODE_DEFAULT, IF_LINK_MODE_DORMANT, /* limit upward transition to dormant */ + IF_LINK_MODE_TESTING, /* limit upward transition to testing */ }; /* diff --git a/net/core/dev.c b/net/core/dev.c index 522288177bbd..fb61522b1ce1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9136,6 +9136,11 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, else netif_dormant_off(dev); + if (rootdev->operstate == IF_OPER_TESTING) + netif_testing_on(dev); + else + netif_testing_off(dev); + if (netif_carrier_ok(rootdev)) netif_carrier_on(dev); else diff --git a/net/core/link_watch.c b/net/core/link_watch.c index f153e0601838..75431ca9300f 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -34,6 +34,9 @@ static DEFINE_SPINLOCK(lweventlist_lock); static unsigned char default_operstate(const struct net_device *dev) { + if (netif_testing(dev)) + return IF_OPER_TESTING; + if (!netif_carrier_ok(dev)) return (dev->ifindex != dev_get_iflink(dev) ? IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN); @@ -55,11 +58,15 @@ static void rfc2863_policy(struct net_device *dev) write_lock_bh(&dev_base_lock); switch(dev->link_mode) { + case IF_LINK_MODE_TESTING: + if (operstate == IF_OPER_UP) + operstate = IF_OPER_TESTING; + break; + case IF_LINK_MODE_DORMANT: if (operstate == IF_OPER_UP) operstate = IF_OPER_DORMANT; break; - case IF_LINK_MODE_DEFAULT: default: break; @@ -74,7 +81,8 @@ static void rfc2863_policy(struct net_device *dev) void linkwatch_init_dev(struct net_device *dev) { /* Handle pre-registration link state changes */ - if (!netif_carrier_ok(dev) || netif_dormant(dev)) + if (!netif_carrier_ok(dev) || netif_dormant(dev) || + netif_testing(dev)) rfc2863_policy(dev); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 709ebbf8ab5b..d6f4f4a9e8ba 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -829,11 +829,18 @@ static void set_operstate(struct net_device *dev, unsigned char transition) switch (transition) { case IF_OPER_UP: if ((operstate == IF_OPER_DORMANT || + operstate == IF_OPER_TESTING || operstate == IF_OPER_UNKNOWN) && - !netif_dormant(dev)) + !netif_dormant(dev) && !netif_testing(dev)) operstate = IF_OPER_UP; break; + case IF_OPER_TESTING: + if (operstate == IF_OPER_UP || + operstate == IF_OPER_UNKNOWN) + operstate = IF_OPER_TESTING; + break; + case IF_OPER_DORMANT: if (operstate == IF_OPER_UP || operstate == IF_OPER_UNKNOWN) -- cgit v1.2.3

From db30a57779b18b7cef092c21887ed2d23ad2bd35 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 20 Apr 2020 00:11:51 +0200 Subject: net: Add testing sysfs attribute Similar to speed, duplex and dormant, report the testing status in sysfs. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S.
Miller --- Documentation/ABI/testing/sysfs-class-net | 13 +++++++++++++ net/core/net-sysfs.c | 15 ++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net index 664a8f6a634f..3b404577f380 100644 --- a/Documentation/ABI/testing/sysfs-class-net +++ b/Documentation/ABI/testing/sysfs-class-net @@ -124,6 +124,19 @@ Description: authentication is performed (e.g: 802.1x). 'link_mode' attribute will also reflect the dormant state. +What: /sys/class/net//testing +Date: April 2002 +KernelVersion: 5.8 +Contact: netdev@vger.kernel.org +Description: + Indicates whether the interface is under test. Possible + values are: + 0: interface is not being tested + 1: interface is being tested + + When an interface is under test, it cannot be expected + to pass packets as normal. + What: /sys/clas/net//duplex Date: October 2009 KernelVersion: 2.6.33 diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 4773ad6ec111..0d9e46de205e 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -243,6 +243,18 @@ static ssize_t duplex_show(struct device *dev, } static DEVICE_ATTR_RO(duplex); +static ssize_t testing_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + + if (netif_running(netdev)) + return sprintf(buf, fmt_dec, !!netif_testing(netdev)); + + return -EINVAL; +} +static DEVICE_ATTR_RO(testing); + static ssize_t dormant_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -260,7 +272,7 @@ static const char *const operstates[] = { "notpresent", /* currently unused */ "down", "lowerlayerdown", - "testing", /* currently unused */ + "testing", "dormant", "up" }; @@ -524,6 +536,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_speed.attr, &dev_attr_duplex.attr, &dev_attr_dormant.attr, + &dev_attr_testing.attr, &dev_attr_operstate.attr, &dev_attr_carrier_changes.attr, &dev_attr_ifalias.attr, -- cgit v1.2.3 From 77e9b2ab451d32c81468ef679c377b2f831cd720 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 20 Apr 2020 00:11:52 +0200 Subject: net: ethtool: self_test: Mark interface in testing operative status When an interface is executing a self test, put the interface into operative status testing. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/ethtool/ioctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 89d0b1827aaf..593fa665f820 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1746,7 +1746,9 @@ static int ethtool_self_test(struct net_device *dev, char __user *useraddr) if (!data) return -ENOMEM; + netif_testing_on(dev); ops->self_test(dev, &test, data); + netif_testing_off(dev); ret = -EFAULT; if (copy_to_user(useraddr, &test, sizeof(test))) -- cgit v1.2.3 From b6246f4d8d0778fd045b84dbd7fc5aadd8f3136e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 20 Apr 2020 22:51:49 +0100 Subject: net: ipv4: remove redundant assignment to variable rc The variable rc is being assigned with a value that is never read and it is being updated later with a new value. The initialization is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: David S. 
Miller --- net/ipv4/af_inet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index cf58e29cf746..c618e242490f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1914,7 +1914,7 @@ static int __init inet_init(void) { struct inet_protosw *q; struct list_head *r; - int rc = -EINVAL; + int rc; sock_skb_cb_check_size(sizeof(struct inet_skb_parm)); -- cgit v1.2.3 From 2a7e978625e887cbb80cb2188cf5f955e4223fa1 Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Mon, 13 Apr 2020 20:40:20 +0200 Subject: batman-adv: Start new development cycle Signed-off-by: Simon Wunderlich --- net/batman-adv/main.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 2a234d0ad445..61d8dbe8c954 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2020.1" +#define BATADV_SOURCE_VERSION "2020.2" #endif /* B.A.T.M.A.N. parameters */ -- cgit v1.2.3 From c08dd06b3d25e8450a2e187ee6b7412d07c460af Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Thu, 26 Mar 2020 18:37:07 +0100 Subject: batman-adv: Fix spelling error in term buffer checkpatch warns about a typo in the word bufFer which was introduced in commit 2191c1bcbc64 ("batman-adv: kernel doc for types.h"). Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 4a17a66cc572..d152b8e81f61 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1086,7 +1086,7 @@ struct batadv_priv_bla { * struct batadv_priv_debug_log - debug logging data */ struct batadv_priv_debug_log { - /** @log_buff: buffer holding the logs (ring bufer) */ + /** @log_buff: buffer holding the logs (ring buffer) */ char log_buff[BATADV_LOG_BUF_LEN]; /** @log_start: index of next character to read */ -- cgit v1.2.3 From 9204a4f876b229505f4ab947124d1160b80bbb4c Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 13 Apr 2020 20:26:07 +0200 Subject: batman-adv: trace: Drop unneeded types.h include The commit 04ae87a52074 ("ftrace: Rework event_create_dir()") restructured various macros in the ftrace framework. These changes also had the nice side effect that the linux/types.h include is no longer necessary to define some of the types used by these macros. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/trace.h | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h index f631b1e01b89..a87547570b4e 100644 --- a/net/batman-adv/trace.h +++ b/net/batman-adv/trace.h @@ -15,7 +15,6 @@ #include #include #include -#include #undef TRACE_SYSTEM #define TRACE_SYSTEM batadv -- cgit v1.2.3 From 26893e7e928e1790852b072edb9bff3c1309e111 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 13 Apr 2020 21:23:29 +0200 Subject: batman-adv: Utilize prandom_u32_max for random [0, max) values The kernel provides a function to create random values from 0 - (max-1) since commit f337db64af05 ("random32: add prandom_u32_max and convert open coded users"). Simply use this function to replace code sections which use prandom_u32 and a handcrafted method to map it to the correct range. 
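The transformation is mechanical; a small editorial sketch (not part of this patch) of the two equivalent forms:

	#include <linux/random.h> /* declares prandom_u32() and prandom_u32_max() */

	/* Both return a pseudo-random value in [0, max); the helper avoids
	 * the open-coded modulo by using a 64-bit multiply and shift
	 * internally.
	 */
	static u32 jitter_old(u32 max)
	{
		return prandom_u32() % max;	/* handcrafted mapping */
	}

	static u32 jitter_new(u32 max)
	{
		return prandom_u32_max(max);	/* helper from commit f337db64af05 */
	}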
Signed-off-by: Sven Eckelmann --- net/batman-adv/bat_iv_ogm.c | 4 ++-- net/batman-adv/bat_v_elp.c | 2 +- net/batman-adv/bat_v_ogm.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index a7c8dd7ae513..e87f19c82e8d 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -280,7 +280,7 @@ batadv_iv_ogm_emit_send_time(const struct batadv_priv *bat_priv) unsigned int msecs; msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER; - msecs += prandom_u32() % (2 * BATADV_JITTER); + msecs += prandom_u32_max(2 * BATADV_JITTER); return jiffies + msecs_to_jiffies(msecs); } @@ -288,7 +288,7 @@ batadv_iv_ogm_emit_send_time(const struct batadv_priv *bat_priv) /* when do we schedule a ogm packet to be sent */ static unsigned long batadv_iv_ogm_fwd_send_time(void) { - return jiffies + msecs_to_jiffies(prandom_u32() % (BATADV_JITTER / 2)); + return jiffies + msecs_to_jiffies(prandom_u32_max(BATADV_JITTER / 2)); } /* apply hop penalty for a normal link */ diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 1e3172db7492..353e49c40e7f 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -49,7 +49,7 @@ static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface) unsigned int msecs; msecs = atomic_read(&hard_iface->bat_v.elp_interval) - BATADV_JITTER; - msecs += prandom_u32() % (2 * BATADV_JITTER); + msecs += prandom_u32_max(2 * BATADV_JITTER); queue_delayed_work(batadv_event_workqueue, &hard_iface->bat_v.elp_wq, msecs_to_jiffies(msecs)); diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 969466218999..0959d32be65c 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -88,7 +88,7 @@ static void batadv_v_ogm_start_queue_timer(struct batadv_hard_iface *hard_iface) unsigned int msecs = BATADV_MAX_AGGREGATION_MS * 1000; /* msecs * [0.9, 1.1] */ - msecs += prandom_u32() % (msecs / 5) - (msecs / 10); + msecs += prandom_u32_max(msecs / 5) - (msecs / 10); queue_delayed_work(batadv_event_workqueue, &hard_iface->bat_v.aggr_wq, msecs_to_jiffies(msecs / 1000)); } @@ -107,7 +107,7 @@ static void batadv_v_ogm_start_timer(struct batadv_priv *bat_priv) return; msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER; - msecs += prandom_u32() % (2 * BATADV_JITTER); + msecs += prandom_u32_max(2 * BATADV_JITTER); queue_delayed_work(batadv_event_workqueue, &bat_priv->bat_v.ogm_wq, msecs_to_jiffies(msecs)); } -- cgit v1.2.3 From 1c79031f8a75c132bbd42e3ef20267af97b67466 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 19 Apr 2020 17:18:47 +0300 Subject: drivers: Remove inclusion of vermagic header Get rid of linux/vermagic.h includes, so that MODULE_ARCH_VERMAGIC from the arch header arch/x86/include/asm/module.h won't be redefined. 
In file included from ./include/linux/module.h:30, from drivers/net/ethernet/3com/3c515.c:56: ./arch/x86/include/asm/module.h:73: warning: "MODULE_ARCH_VERMAGIC" redefined 73 | # define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY | In file included from drivers/net/ethernet/3com/3c515.c:25: ./include/linux/vermagic.h:28: note: this is the location of the previous definition 28 | #define MODULE_ARCH_VERMAGIC "" | Fixes: 6bba2e89a88c ("net/3com: Delete driver and module versions from 3com drivers") Co-developed-by: Borislav Petkov Signed-off-by: Borislav Petkov Acked-by: Shannon Nelson # ionic Acked-by: Sebastian Reichel # power Signed-off-by: Leon Romanovsky Signed-off-by: David S. Miller --- drivers/net/bonding/bonding_priv.h | 2 +- drivers/net/ethernet/3com/3c509.c | 1 - drivers/net/ethernet/3com/3c515.c | 1 - drivers/net/ethernet/adaptec/starfire.c | 1 - drivers/net/ethernet/pensando/ionic/ionic_main.c | 2 +- drivers/power/supply/test_power.c | 2 +- net/ethtool/ioctl.c | 3 +-- 7 files changed, 4 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/drivers/net/bonding/bonding_priv.h b/drivers/net/bonding/bonding_priv.h index 45b77bc8c7b3..48cdf3a49a7d 100644 --- a/drivers/net/bonding/bonding_priv.h +++ b/drivers/net/bonding/bonding_priv.h @@ -14,7 +14,7 @@ #ifndef _BONDING_PRIV_H #define _BONDING_PRIV_H -#include +#include #define DRV_NAME "bonding" #define DRV_DESCRIPTION "Ethernet Channel Bonding Driver" diff --git a/drivers/net/ethernet/3com/3c509.c b/drivers/net/ethernet/3com/3c509.c index b762176a1406..139d0120f511 100644 --- a/drivers/net/ethernet/3com/3c509.c +++ b/drivers/net/ethernet/3com/3c509.c @@ -85,7 +85,6 @@ #include #include #include -#include #include #include diff --git a/drivers/net/ethernet/3com/3c515.c b/drivers/net/ethernet/3com/3c515.c index 90312fcd6319..47b4215bb93b 100644 --- a/drivers/net/ethernet/3com/3c515.c +++ b/drivers/net/ethernet/3com/3c515.c @@ -22,7 +22,6 @@ */ -#include #define DRV_NAME "3c515" #define CORKSCREW 1 diff --git a/drivers/net/ethernet/adaptec/starfire.c b/drivers/net/ethernet/adaptec/starfire.c index 2db42211329f..a64191fc2af9 100644 --- a/drivers/net/ethernet/adaptec/starfire.c +++ b/drivers/net/ethernet/adaptec/starfire.c @@ -45,7 +45,6 @@ #include /* Processor type for cache alignment. 
*/ #include #include -#include /* * The current frame processor firmware fails to checksum a fragment diff --git a/drivers/net/ethernet/pensando/ionic/ionic_main.c b/drivers/net/ethernet/pensando/ionic/ionic_main.c index 588c62e9add7..3ed150512091 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_main.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_main.c @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include "ionic.h" #include "ionic_bus.h" diff --git a/drivers/power/supply/test_power.c b/drivers/power/supply/test_power.c index 65c23ef6408d..b3c05ff05783 100644 --- a/drivers/power/supply/test_power.c +++ b/drivers/power/supply/test_power.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include enum test_power_id { TEST_AC, diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 593fa665f820..226d5ecdd567 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -28,7 +27,7 @@ #include #include #include - +#include #include "common.h" /* -- cgit v1.2.3 From beb97d3a3192c00575580af9073921c6283cf93d Mon Sep 17 00:00:00 2001 From: wenxu Date: Tue, 21 Apr 2020 07:55:43 +0800 Subject: net/sched: act_ct: update nf_conn_acct for act_ct SW offload in flowtable When act_ct SW offload is used with the flowtable, the counter of the conntrack entry is never updated. So update the nf_conn_acct counter in the act_ct flowtable software offload path. Signed-off-by: wenxu Reviewed-by: Roi Dayan Signed-off-by: David S. Miller --- net/sched/act_ct.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 1a766393be62..9adff83b523b 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -536,6 +537,7 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p, flow_offload_refresh(nf_ft, flow); nf_conntrack_get(&ct->ct_general); nf_ct_set(skb, ct, ctinfo); + nf_ct_acct_update(ct, dir, skb->len); return true; } -- cgit v1.2.3 From 540bde5c2c3da005b87b3edb394d6ca4f890777d Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 21 Apr 2020 11:09:12 +0800 Subject: ila: remove unused macro 'ILA_HASH_TABLE_SIZE' net/ipv6/ila/ila_xlat.c:604:0: warning: macro "ILA_HASH_TABLE_SIZE" is not used [-Wunused-macros] Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/ipv6/ila/ila_xlat.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 5fc1f4e0c0cf..a1ac0e3d8c60 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -601,8 +601,6 @@ out_ret: return ret; } -#define ILA_HASH_TABLE_SIZE 1024 - int ila_xlat_init_net(struct net *net) { struct ila_net *ilan = net_generic(net, ila_net_id); -- cgit v1.2.3 From dfddb54043f0a377f642bd0e6a28aa40769e2e65 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Tue, 21 Apr 2020 13:10:54 +0530 Subject: net: qrtr: Add tracepoint support Add tracepoint support for QRTR with NS as the first candidate. Later on this can be extended to core QRTR and transport drivers. The trace_printk() used in NS has been replaced by tracepoints. Signed-off-by: Manivannan Sadhasivam Signed-off-by: David S.
Miller --- include/trace/events/qrtr.h | 115 ++++++++++++++++++++++++++++++++++++++++++++ net/qrtr/ns.c | 20 ++++---- 2 files changed, 126 insertions(+), 9 deletions(-) create mode 100644 include/trace/events/qrtr.h (limited to 'net') diff --git a/include/trace/events/qrtr.h b/include/trace/events/qrtr.h new file mode 100644 index 000000000000..b1de14c3bb93 --- /dev/null +++ b/include/trace/events/qrtr.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM qrtr + +#if !defined(_TRACE_QRTR_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_QRTR_H + +#include +#include + +TRACE_EVENT(qrtr_ns_service_announce_new, + + TP_PROTO(__le32 service, __le32 instance, __le32 node, __le32 port), + + TP_ARGS(service, instance, node, port), + + TP_STRUCT__entry( + __field(__le32, service) + __field(__le32, instance) + __field(__le32, node) + __field(__le32, port) + ), + + TP_fast_assign( + __entry->service = service; + __entry->instance = instance; + __entry->node = node; + __entry->port = port; + ), + + TP_printk("advertising new server [%d:%x]@[%d:%d]", + __entry->service, __entry->instance, __entry->node, + __entry->port + ) +); + +TRACE_EVENT(qrtr_ns_service_announce_del, + + TP_PROTO(__le32 service, __le32 instance, __le32 node, __le32 port), + + TP_ARGS(service, instance, node, port), + + TP_STRUCT__entry( + __field(__le32, service) + __field(__le32, instance) + __field(__le32, node) + __field(__le32, port) + ), + + TP_fast_assign( + __entry->service = service; + __entry->instance = instance; + __entry->node = node; + __entry->port = port; + ), + + TP_printk("advertising removal of server [%d:%x]@[%d:%d]", + __entry->service, __entry->instance, __entry->node, + __entry->port + ) +); + +TRACE_EVENT(qrtr_ns_server_add, + + TP_PROTO(__le32 service, __le32 instance, __le32 node, __le32 port), + + TP_ARGS(service, instance, node, port), + + TP_STRUCT__entry( + __field(__le32, service) + __field(__le32, instance) + __field(__le32, node) + __field(__le32, port) + ), + + TP_fast_assign( + __entry->service = service; + __entry->instance = instance; + __entry->node = node; + __entry->port = port; + ), + + TP_printk("add server [%d:%x]@[%d:%d]", + __entry->service, __entry->instance, __entry->node, + __entry->port + ) +); + +TRACE_EVENT(qrtr_ns_message, + + TP_PROTO(const char * const ctrl_pkt_str, __u32 sq_node, __u32 sq_port), + + TP_ARGS(ctrl_pkt_str, sq_node, sq_port), + + TP_STRUCT__entry( + __string(ctrl_pkt_str, ctrl_pkt_str) + __field(__u32, sq_node) + __field(__u32, sq_port) + ), + + TP_fast_assign( + __assign_str(ctrl_pkt_str, ctrl_pkt_str); + __entry->sq_node = sq_node; + __entry->sq_port = sq_port; + ), + + TP_printk("%s from %d:%d", + __get_str(ctrl_pkt_str), __entry->sq_node, __entry->sq_port + ) +); + +#endif /* _TRACE_QRTR_H */ + +/* This part must be outside protection */ +#include diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index e7d0fe3f4330..3ca196fc7f9b 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -12,6 +12,9 @@ #include "qrtr.h" +#define CREATE_TRACE_POINTS +#include + static RADIX_TREE(nodes, GFP_KERNEL); static struct { @@ -105,8 +108,8 @@ static int service_announce_new(struct sockaddr_qrtr *dest, struct msghdr msg = { }; struct kvec iv; - trace_printk("advertising new server [%d:%x]@[%d:%d]\n", - srv->service, srv->instance, srv->node, srv->port); + trace_qrtr_ns_service_announce_new(srv->service, srv->instance, + srv->node, srv->port); iv.iov_base = &pkt; iv.iov_len = sizeof(pkt); @@ -132,8 +135,8 @@ static int 
service_announce_del(struct sockaddr_qrtr *dest, struct kvec iv; int ret; - trace_printk("advertising removal of server [%d:%x]@[%d:%d]\n", - srv->service, srv->instance, srv->node, srv->port); + trace_qrtr_ns_service_announce_del(srv->service, srv->instance, + srv->node, srv->port); iv.iov_base = &pkt; iv.iov_len = sizeof(pkt); @@ -244,8 +247,8 @@ static struct qrtr_server *server_add(unsigned int service, radix_tree_insert(&node->servers, port, srv); - trace_printk("add server [%d:%x]@[%d:%d]\n", srv->service, - srv->instance, srv->node, srv->port); + trace_qrtr_ns_server_add(srv->service, srv->instance, + srv->node, srv->port); return srv; @@ -633,9 +636,8 @@ static void qrtr_ns_worker(struct work_struct *work) cmd = le32_to_cpu(pkt->cmd); if (cmd < ARRAY_SIZE(qrtr_ctrl_pkt_strings) && qrtr_ctrl_pkt_strings[cmd]) - trace_printk("%s from %d:%d\n", - qrtr_ctrl_pkt_strings[cmd], sq.sq_node, - sq.sq_port); + trace_qrtr_ns_message(qrtr_ctrl_pkt_strings[cmd], + sq.sq_node, sq.sq_port); ret = 0; switch (cmd) { -- cgit v1.2.3 From 8518307dc2b2a97b235c74920b45020f9bebe33e Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Wed, 22 Apr 2020 15:16:36 +0800 Subject: net: caif: use true,false for bool variables Fix the following coccicheck warning: net/caif/caif_dev.c:410:2-13: WARNING: Assignment of 0/1 to bool variable net/caif/caif_dev.c:445:2-13: WARNING: Assignment of 0/1 to bool variable net/caif/caif_dev.c:145:1-12: WARNING: Assignment of 0/1 to bool variable net/caif/caif_dev.c:223:1-12: WARNING: Assignment of 0/1 to bool variable Signed-off-by: Jason Yan Signed-off-by: David S. Miller --- net/caif/caif_dev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 195d2d67be8a..c10e5a55758d 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -142,7 +142,7 @@ static void caif_flow_cb(struct sk_buff *skb) spin_lock_bh(&caifd->flow_lock); send_xoff = caifd->xoff; - caifd->xoff = 0; + caifd->xoff = false; dtor = caifd->xoff_skb_dtor; if (WARN_ON(caifd->xoff_skb != skb)) @@ -220,7 +220,7 @@ static int transmit(struct cflayer *layer, struct cfpkt *pkt) pr_debug("queue has stopped(%d) or is full (%d > %d)\n", netif_queue_stopped(caifd->netdev), qlen, high); - caifd->xoff = 1; + caifd->xoff = true; caifd->xoff_skb = skb; caifd->xoff_skb_dtor = skb->destructor; skb->destructor = caif_flow_cb; @@ -407,7 +407,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, break; } - caifd->xoff = 0; + caifd->xoff = false; cfcnfg_set_phy_state(cfg, &caifd->layer, true); rcu_read_unlock(); @@ -442,7 +442,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, if (caifd->xoff_skb_dtor != NULL && caifd->xoff_skb != NULL) caifd->xoff_skb->destructor = caifd->xoff_skb_dtor; - caifd->xoff = 0; + caifd->xoff = false; caifd->xoff_skb_dtor = NULL; caifd->xoff_skb = NULL; -- cgit v1.2.3 From b75326c201242de9495ff98e5d5cff41d7fc0d9d Mon Sep 17 00:00:00 2001 From: Fernando Gont Date: Sun, 19 Apr 2020 09:24:57 -0300 Subject: ipv6: Honor all IPv6 PIO Valid Lifetime values RFC4862 5.5.3 e) prevents received Router Advertisements from reducing the Valid Lifetime of configured addresses to less than two hours, thus preventing hosts from reacting to the information provided by a router that has positive knowledge that a prefix has become invalid. This patch makes hosts honor all Valid Lifetime values, as per draft-gont-6man-slaac-renum-06, Section 4.2. 
This is meant to help mitigate the problem discussed in draft-ietf-v6ops-slaac-renum. Note: Attacks aiming at disabling an advertised prefix via a Valid Lifetime of 0 are not really more harmful than other attacks that can be performed via forged RA messages, such as those aiming at completely disabling a next-hop router via an RA that advertises a Router Lifetime of 0, or performing a Denial of Service (DoS) attack by advertising illegitimate prefixes via forged PIOs. In scenarios where RA-based attacks are of concern, proper mitigations such as RA-Guard [RFC6105] [RFC7113] should be implemented. Signed-off-by: Fernando Gont Signed-off-by: David S. Miller --- include/net/addrconf.h | 2 -- net/ipv6/addrconf.c | 27 +++++++-------------------- 2 files changed, 7 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index e0eabe58aa8b..fdb07105384c 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -6,8 +6,6 @@ #define RTR_SOLICITATION_INTERVAL (4*HZ) #define RTR_SOLICITATION_MAX_INTERVAL (3600*HZ) /* 1 hour */ -#define MIN_VALID_LIFETIME (2*3600) /* 2 hours */ - #define TEMP_VALID_LIFETIME (7*86400) #define TEMP_PREFERRED_LIFETIME (86400) #define REGEN_MAX_RETRY (3) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 24e319dfb510..27b4fb6e452b 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2564,7 +2564,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, __u32 valid_lft, u32 prefered_lft) { struct inet6_ifaddr *ifp = ipv6_get_ifaddr(net, addr, dev, 1); - int create = 0, update_lft = 0; + int create = 0; if (!ifp && valid_lft) { int max_addresses = in6_dev->cnf.max_addresses; @@ -2608,32 +2608,19 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, unsigned long now; u32 stored_lft; - /* update lifetime (RFC2462 5.5.3 e) */ + /* Update lifetime (RFC4862 5.5.3 e) + * We deviate from RFC4862 by honoring all Valid Lifetimes to + * improve the reaction of SLAAC to renumbering events + * (draft-gont-6man-slaac-renum-06, Section 4.2) + */ spin_lock_bh(&ifp->lock); now = jiffies; if (ifp->valid_lft > (now - ifp->tstamp) / HZ) stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ; else stored_lft = 0; - if (!create && stored_lft) { - const u32 minimum_lft = min_t(u32, - stored_lft, MIN_VALID_LIFETIME); - valid_lft = max(valid_lft, minimum_lft); - - /* RFC4862 Section 5.5.3e: - * "Note that the preferred lifetime of the - * corresponding address is always reset to - * the Preferred Lifetime in the received - * Prefix Information option, regardless of - * whether the valid lifetime is also reset or - * ignored." - * - * So we should always update prefered_lft here. - */ - update_lft = 1; - } - if (update_lft) { + if (!create && stored_lft) { ifp->valid_lft = valid_lft; ifp->prefered_lft = prefered_lft; ifp->tstamp = now; -- cgit v1.2.3 From e131a5634830047923c694b4ce0c3b31745ff01b Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 21 Apr 2020 16:41:08 +0300 Subject: net: dsa: add GRO support via gro_cells gro_cells lib is used by different encapsulating netdevices, such as geneve, macsec, vxlan etc. to speed up decapsulated traffic processing. CPU tag is a sort of "encapsulation", and we can use the same mechs to greatly improve overall DSA performance. skbs are passed to the GRO layer after removing CPU tags, so we don't need any new packet offload types as it was firstly proposed by me in the first GRO-over-DSA variant [1]. 
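For context before the numbers below: the gro_cells API has only three entry points, and the pattern adopted here for DSA slave devices is the same one the tunnel drivers use. A minimal sketch with a hypothetical driver and error handling trimmed:

	#include <net/gro_cells.h>

	struct my_priv {
		struct gro_cells gcells;
	};

	/* once at device setup */
	err = gro_cells_init(&priv->gcells, dev);

	/* per decapsulated skb, instead of netif_receive_skb() */
	gro_cells_receive(&priv->gcells, skb);

	/* once at teardown, after the device is unregistered */
	gro_cells_destroy(&priv->gcells);

gro_cells_receive() queues the skb on a per-cpu backlog that is drained from a NAPI poll context, which is what allows fully non-linear skbs to be merged by GRO.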
The size of struct gro_cells is sizeof(void *), so hot struct dsa_slave_priv becomes only 4/8 bytes bigger, and all critical fields remain in one 32-byte cacheline. The other positive side effect is that drivers for network devices that can be shipped as CPU ports of DSA-driven switches can now use napi_gro_frags() to pass skbs to the kernel. Packets built that way are completely non-linear and are likely being dropped without GRO. This was tested on a soon-to-be-mainlined Ethernet driver that uses napi_gro_frags(), and the overall performance was on par with the variant from [1], sometimes even better due to minimal overhead. net.core.gro_normal_batch tuning may help to push it to the limit on particular setups and platforms. iperf3 IPoE VLAN NAT TCP forwarding (port1.218 -> port0) setup on 1.2 GHz MIPS board:

5.7-rc2 baseline:

[ID] Interval           Transfer     Bitrate        Retr
[ 5] 0.00-120.01 sec    9.00 GBytes  644 Mbits/sec  413  sender
[ 5] 0.00-120.00 sec    8.99 GBytes  644 Mbits/sec       receiver

Iface      RX packets  TX packets
eth0       7097731     7097702
port0      426050      6671829
port1      6671681     425862
port1.218  6671677     425851

With this patch:

[ID] Interval           Transfer     Bitrate        Retr
[ 5] 0.00-120.01 sec    12.2 GBytes  870 Mbits/sec  122  sender
[ 5] 0.00-120.00 sec    12.2 GBytes  870 Mbits/sec       receiver

Iface      RX packets  TX packets
eth0       9474792     9474777
port0      455200      353288
port1      9019592     455035
port1.218  353144      455024

v2: - Add some performance examples in the commit message; - No functional changes. [1] https://lore.kernel.org/netdev/20191230143028.27313-1-alobakin@dlink.ru/ Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- net/dsa/Kconfig | 1 + net/dsa/dsa.c | 2 +- net/dsa/dsa_priv.h | 3 +++ net/dsa/slave.c | 10 +++++++++- 4 files changed, 14 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 92663dcb3aa2..739613070d07 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -9,6 +9,7 @@ menuconfig NET_DSA tristate "Distributed Switch Architecture" depends on HAVE_NET_DSA depends on BRIDGE || BRIDGE=n + select GRO_CELLS select NET_SWITCHDEV select PHYLINK select NET_DEVLINK diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index ee2610c4d46a..0384a911779e 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -234,7 +234,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, if (dsa_skb_defer_rx_timestamp(p, skb)) return 0; - netif_receive_skb(skb); + gro_cells_receive(&p->gcells, skb); return 0; } diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 904cc7c9b882..6d9a1ef65fa0 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -11,6 +11,7 @@ #include #include #include +#include enum { DSA_NOTIFIER_AGEING_TIME, @@ -77,6 +78,8 @@ struct dsa_slave_priv { struct pcpu_sw_netstats *stats64; + struct gro_cells gcells; + /* DSA port data, such as switch, port index, etc.
*/ struct dsa_port *dp; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index e94eb1aac602..f2c241cf3a80 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1762,6 +1762,11 @@ int dsa_slave_create(struct dsa_port *port) free_netdev(slave_dev); return -ENOMEM; } + + ret = gro_cells_init(&p->gcells, slave_dev); + if (ret) + goto out_free; + p->dp = port; INIT_LIST_HEAD(&p->mall_tc_list); p->xmit = cpu_dp->tag_ops->xmit; @@ -1781,7 +1786,7 @@ int dsa_slave_create(struct dsa_port *port) ret = dsa_slave_phy_setup(slave_dev); if (ret) { netdev_err(master, "error %d setting up slave phy\n", ret); - goto out_free; + goto out_gcells; } dsa_slave_notify(slave_dev, DSA_PORT_REGISTER); @@ -1800,6 +1805,8 @@ out_phy: phylink_disconnect_phy(p->dp->pl); rtnl_unlock(); phylink_destroy(p->dp->pl); +out_gcells: + gro_cells_destroy(&p->gcells); out_free: free_percpu(p->stats64); free_netdev(slave_dev); @@ -1820,6 +1827,7 @@ void dsa_slave_destroy(struct net_device *slave_dev) dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER); unregister_netdev(slave_dev); phylink_destroy(dp->pl); + gro_cells_destroy(&p->gcells); free_percpu(p->stats64); free_netdev(slave_dev); } -- cgit v1.2.3 From 6f8b12d661d09b488b9ac879b8eafbd2cc4a1450 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Apr 2020 09:13:27 -0700 Subject: net: napi: add hard irqs deferral feature Back in commit 3b47d30396ba ("net: gro: add a per device gro flush timer") we added the ability to arm one high resolution timer, that we used to keep not-complete packets in GRO engine a bit longer, hoping that further frames might be added to them. Since then, we added the napi_complete_done() interface, and commit 364b6055738b ("net: busy-poll: return busypolling status to drivers") allowed drivers to avoid re-arming NIC interrupts if we made a promise that their NAPI poll() handler would be called in the near future. This infrastructure can be leveraged, thanks to a new device parameter, which allows arming the napi hrtimer instead of re-arming the device hard IRQ. We have noticed that on some servers with 32 RX queues or more, the chit-chat between the NIC and the host caused by IRQ delivery and re-arming could hurt throughput by ~20% on a 100Gbit NIC. In contrast, hrtimers are using local (percpu) resources and might have lower cost. The new tunable, named napi_defer_hard_irqs, is placed in the same hierarchy as gro_flush_timeout (/sys/class/net/ethX/). By default, both gro_flush_timeout and napi_defer_hard_irqs are zero. This patch does not change the prior behavior of gro_flush_timeout if used alone: NIC hard irqs should be rearmed as before. One concrete usage can be: echo 20000 >/sys/class/net/eth1/gro_flush_timeout echo 10 >/sys/class/net/eth1/napi_defer_hard_irqs If at least one packet is retired, then we will reset the napi counter to 10 (napi_defer_hard_irqs), ensuring at least 10 periodic scans of the queue. On busy queues, this should avoid NIC hard IRQs, while before this patch IRQ avoidance was only possible if napi->poll() was exhausting its budget and not calling napi_complete_done(). This feature can also be used to work around some non-optimal NIC irq coalescing strategies. Having the ability to insert XX usec delays between each napi->poll() can increase cache efficiency, since we increase batch sizes. It also keeps serving cpus not idle too long, reducing tail latencies. Co-developed-by: Luigi Rizzo Signed-off-by: Eric Dumazet Signed-off-by: David S.
Miller --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 29 ++++++++++++++++++----------- net/core/net-sysfs.c | 18 ++++++++++++++++++ 3 files changed, 38 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0750b54b3765..5a8d40f1ffe2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -329,6 +329,7 @@ struct napi_struct { unsigned long state; int weight; + int defer_hard_irqs_count; unsigned long gro_bitmask; int (*poll)(struct napi_struct *, int); #ifdef CONFIG_NETPOLL @@ -1995,6 +1996,7 @@ struct net_device { struct bpf_prog __rcu *xdp_prog; unsigned long gro_flush_timeout; + int napi_defer_hard_irqs; rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; diff --git a/net/core/dev.c b/net/core/dev.c index fb61522b1ce1..67585484ad32 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6227,7 +6227,8 @@ EXPORT_SYMBOL(__napi_schedule_irqoff); bool napi_complete_done(struct napi_struct *n, int work_done) { - unsigned long flags, val, new; + unsigned long flags, val, new, timeout = 0; + bool ret = true; /* * 1) Don't let napi dequeue from the cpu poll list @@ -6239,20 +6240,23 @@ bool napi_complete_done(struct napi_struct *n, int work_done) NAPIF_STATE_IN_BUSY_POLL))) return false; - if (n->gro_bitmask) { - unsigned long timeout = 0; - - if (work_done) + if (work_done) { + if (n->gro_bitmask) timeout = n->dev->gro_flush_timeout; - + n->defer_hard_irqs_count = n->dev->napi_defer_hard_irqs; + } + if (n->defer_hard_irqs_count > 0) { + n->defer_hard_irqs_count--; + timeout = n->dev->gro_flush_timeout; + if (timeout) + ret = false; + } + if (n->gro_bitmask) { /* When the NAPI instance uses a timeout and keeps postponing * it, we need to bound somehow the time packets are kept in * the GRO layer */ napi_gro_flush(n, !!timeout); - if (timeout) - hrtimer_start(&n->timer, ns_to_ktime(timeout), - HRTIMER_MODE_REL_PINNED); } gro_normal_list(n); @@ -6284,7 +6288,10 @@ bool napi_complete_done(struct napi_struct *n, int work_done) return false; } - return true; + if (timeout) + hrtimer_start(&n->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); + return ret; } EXPORT_SYMBOL(napi_complete_done); @@ -6464,7 +6471,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) /* Note : we use a relaxed variant of napi_schedule_prep() not setting * NAPI_STATE_MISSED, since we do not react to a device IRQ. 
*/ - if (napi->gro_bitmask && !napi_disable_pending(napi) && + if (!napi_disable_pending(napi) && !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) __napi_schedule_irqoff(napi); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 0d9e46de205e..f3b650cd0923 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -382,6 +382,23 @@ static ssize_t gro_flush_timeout_store(struct device *dev, } NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); +static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val) +{ + dev->napi_defer_hard_irqs = val; + return 0; +} + +static ssize_t napi_defer_hard_irqs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + return netdev_store(dev, attr, buf, len, change_napi_defer_hard_irqs); +} +NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_dec); + static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -545,6 +562,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_flags.attr, &dev_attr_tx_queue_len.attr, &dev_attr_gro_flush_timeout.attr, + &dev_attr_napi_defer_hard_irqs.attr, &dev_attr_phys_port_id.attr, &dev_attr_phys_port_name.attr, &dev_attr_phys_switch_id.attr, -- cgit v1.2.3 From 7e417a66b86c110f4b282945dac82e21e0b08328 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Apr 2020 09:13:28 -0700 Subject: net: napi: use READ_ONCE()/WRITE_ONCE() gro_flush_timeout and napi_defer_hard_irqs can be read from napi_complete_done() while other cpus write the value, without explicit synchronization. Use READ_ONCE()/WRITE_ONCE() to annotate the races. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 6 +++--- net/core/net-sysfs.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 67585484ad32..afff16849c26 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6242,12 +6242,12 @@ bool napi_complete_done(struct napi_struct *n, int work_done) if (work_done) { if (n->gro_bitmask) - timeout = n->dev->gro_flush_timeout; - n->defer_hard_irqs_count = n->dev->napi_defer_hard_irqs; + timeout = READ_ONCE(n->dev->gro_flush_timeout); + n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs); } if (n->defer_hard_irqs_count > 0) { n->defer_hard_irqs_count--; - timeout = n->dev->gro_flush_timeout; + timeout = READ_ONCE(n->dev->gro_flush_timeout); if (timeout) ret = false; } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index f3b650cd0923..880e89c894f6 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -367,7 +367,7 @@ NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec); static int change_gro_flush_timeout(struct net_device *dev, unsigned long val) { - dev->gro_flush_timeout = val; + WRITE_ONCE(dev->gro_flush_timeout, val); return 0; } @@ -384,7 +384,7 @@ NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val) { - dev->napi_defer_hard_irqs = val; + WRITE_ONCE(dev->napi_defer_hard_irqs, val); return 0; } -- cgit v1.2.3 From 3c9143d96852485725d330b9164062d8f2d90a38 Mon Sep 17 00:00:00 2001 From: Xu Wang Date: Thu, 23 Apr 2020 13:43:13 +0800 Subject: net: sched: Remove unnecessary cast in kfree Remove unnecessary casts in the argument to kfree. Signed-off-by: Xu Wang Signed-off-by: David S.
Miller --- net/sched/em_ipt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/em_ipt.c b/net/sched/em_ipt.c index eecfe072c508..18755d29fd15 100644 --- a/net/sched/em_ipt.c +++ b/net/sched/em_ipt.c @@ -199,7 +199,7 @@ static void em_ipt_destroy(struct tcf_ematch *em) im->match->destroy(&par); } module_put(im->match->me); - kfree((void *)im); + kfree(im); } static int em_ipt_match(struct sk_buff *skb, struct tcf_ematch *em, -- cgit v1.2.3 From c7c4c44c9a95d87e50ced38f7480e779cb472174 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Fri, 24 Apr 2020 08:08:02 +0800 Subject: net: openvswitch: expand the meters supported number In the kernel datapath of Open vSwitch, there are only 1024 meter buckets in one datapath. Installing more than 1024 meters (e.g. 8192) may lead to a performance drop. But in some cases, for example, when Open vSwitch is used as an edge gateway, there should be at least 20K meters, used for IP address bandwidth limitation. [The Open vSwitch userspace datapath has this issue too.] To make meters more scalable, this patch uses a meter array instead of hash tables, and expands/shrinks the array when necessary. So we can install more meters than before in the datapath. By introducing struct dp_meter_instance, it is easy to expand the meter table by changing the *ti pointer in struct dp_meter_table. Cc: Pravin B Shelar Cc: Andy Zhou Signed-off-by: Tonghao Zhang Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/datapath.h | 2 +- net/openvswitch/meter.c | 240 ++++++++++++++++++++++++++++++++++----------- net/openvswitch/meter.h | 16 ++- 3 files changed, 195 insertions(+), 63 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index e239a46c2f94..2016dd107939 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -82,7 +82,7 @@ struct datapath { u32 max_headroom; /* Switch meters. */ - struct hlist_head *meters; + struct dp_meter_table meter_tbl; }; /** diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 5010d1ddd4bd..f806ded1dd0a 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -19,8 +19,6 @@ #include "datapath.h" #include "meter.h" -#define METER_HASH_BUCKETS 1024 - static const struct nla_policy meter_policy[OVS_METER_ATTR_MAX + 1] = { [OVS_METER_ATTR_ID] = { .type = NLA_U32, }, [OVS_METER_ATTR_KBPS] = { .type = NLA_FLAG }, @@ -39,6 +37,11 @@ static const struct nla_policy band_policy[OVS_BAND_ATTR_MAX + 1] = { [OVS_BAND_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, }; +static u32 meter_hash(struct dp_meter_instance *ti, u32 id) +{ + return id % ti->n_meters; +} + static void ovs_meter_free(struct dp_meter *meter) { if (!meter) @@ -47,40 +50,153 @@ static void ovs_meter_free(struct dp_meter *meter) kfree_rcu(meter, rcu); } -static struct hlist_head *meter_hash_bucket(const struct datapath *dp, - u32 meter_id) -{ - return &dp->meters[meter_id & (METER_HASH_BUCKETS - 1)]; -} - /* Call with ovs_mutex or RCU read lock.
*/ -static struct dp_meter *lookup_meter(const struct datapath *dp, +static struct dp_meter *lookup_meter(const struct dp_meter_table *tbl, u32 meter_id) { + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); + u32 hash = meter_hash(ti, meter_id); struct dp_meter *meter; - struct hlist_head *head; - head = meter_hash_bucket(dp, meter_id); - hlist_for_each_entry_rcu(meter, head, dp_hash_node, - lockdep_ovsl_is_held()) { - if (meter->id == meter_id) - return meter; - } + meter = rcu_dereference_ovsl(ti->dp_meters[hash]); + if (meter && likely(meter->id == meter_id)) + return meter; + return NULL; } -static void attach_meter(struct datapath *dp, struct dp_meter *meter) +static struct dp_meter_instance *dp_meter_instance_alloc(const u32 size) +{ + struct dp_meter_instance *ti; + + ti = kvzalloc(sizeof(*ti) + + sizeof(struct dp_meter *) * size, + GFP_KERNEL); + if (!ti) + return NULL; + + ti->n_meters = size; + + return ti; +} + +static void dp_meter_instance_free(struct dp_meter_instance *ti) +{ + kvfree(ti); +} + +static void dp_meter_instance_free_rcu(struct rcu_head *rcu) +{ + struct dp_meter_instance *ti; + + ti = container_of(rcu, struct dp_meter_instance, rcu); + kvfree(ti); +} + +static int +dp_meter_instance_realloc(struct dp_meter_table *tbl, u32 size) +{ + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); + int n_meters = min(size, ti->n_meters); + struct dp_meter_instance *new_ti; + int i; + + new_ti = dp_meter_instance_alloc(size); + if (!new_ti) + return -ENOMEM; + + for (i = 0; i < n_meters; i++) + new_ti->dp_meters[i] = + rcu_dereference_ovsl(ti->dp_meters[i]); + + rcu_assign_pointer(tbl->ti, new_ti); + call_rcu(&ti->rcu, dp_meter_instance_free_rcu); + + return 0; +} + +static void dp_meter_instance_insert(struct dp_meter_instance *ti, + struct dp_meter *meter) +{ + u32 hash; + + hash = meter_hash(ti, meter->id); + rcu_assign_pointer(ti->dp_meters[hash], meter); +} + +static void dp_meter_instance_remove(struct dp_meter_instance *ti, + struct dp_meter *meter) { - struct hlist_head *head = meter_hash_bucket(dp, meter->id); + u32 hash; - hlist_add_head_rcu(&meter->dp_hash_node, head); + hash = meter_hash(ti, meter->id); + RCU_INIT_POINTER(ti->dp_meters[hash], NULL); } -static void detach_meter(struct dp_meter *meter) +static int attach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) { + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); + u32 hash = meter_hash(ti, meter->id); + + /* In generally, slots selected should be empty, because + * OvS uses id-pool to fetch a available id. + */ + if (unlikely(rcu_dereference_ovsl(ti->dp_meters[hash]))) + return -EBUSY; + + dp_meter_instance_insert(ti, meter); + + /* That function is thread-safe. */ + if (++tbl->count >= ti->n_meters) + if (dp_meter_instance_realloc(tbl, ti->n_meters * 2)) + goto expand_err; + + return 0; + +expand_err: + dp_meter_instance_remove(ti, meter); + tbl->count--; + return -ENOMEM; +} + +static int detach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) +{ + struct dp_meter_instance *ti; + ASSERT_OVSL(); - if (meter) - hlist_del_rcu(&meter->dp_hash_node); + if (!meter) + return 0; + + ti = rcu_dereference_ovsl(tbl->ti); + dp_meter_instance_remove(ti, meter); + + tbl->count--; + + /* Shrink the meter array if necessary. */ + if (ti->n_meters > DP_METER_ARRAY_SIZE_MIN && + tbl->count <= (ti->n_meters / 4)) { + int half_size = ti->n_meters / 2; + int i; + + /* Avoid hash collision, don't move slots to other place. 
+ * Make sure there are no references of meters in array + * which will be released. + */ + for (i = half_size; i < ti->n_meters; i++) + if (rcu_dereference_ovsl(ti->dp_meters[i])) + goto out; + + if (dp_meter_instance_realloc(tbl, half_size)) + goto shrink_err; + } + +out: + return 0; + +shrink_err: + dp_meter_instance_insert(ti, meter); + tbl->count++; + return -ENOMEM; } static struct sk_buff * @@ -273,6 +389,7 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) struct sk_buff *reply; struct ovs_header *ovs_reply_header; struct ovs_header *ovs_header = info->userhdr; + struct dp_meter_table *meter_tbl; struct datapath *dp; int err; u32 meter_id; @@ -300,12 +417,18 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) goto exit_unlock; } + meter_tbl = &dp->meter_tbl; meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); - /* Cannot fail after this. */ - old_meter = lookup_meter(dp, meter_id); - detach_meter(old_meter); - attach_meter(dp, meter); + old_meter = lookup_meter(meter_tbl, meter_id); + err = detach_meter(meter_tbl, old_meter); + if (err) + goto exit_unlock; + + err = attach_meter(meter_tbl, meter); + if (err) + goto exit_unlock; + ovs_unlock(); /* Build response with the meter_id and stats from @@ -337,14 +460,14 @@ exit_free_meter: static int ovs_meter_cmd_get(struct sk_buff *skb, struct genl_info *info) { - struct nlattr **a = info->attrs; - u32 meter_id; struct ovs_header *ovs_header = info->userhdr; struct ovs_header *ovs_reply_header; + struct nlattr **a = info->attrs; + struct dp_meter *meter; + struct sk_buff *reply; struct datapath *dp; + u32 meter_id; int err; - struct sk_buff *reply; - struct dp_meter *meter; if (!a[OVS_METER_ATTR_ID]) return -EINVAL; @@ -365,7 +488,7 @@ static int ovs_meter_cmd_get(struct sk_buff *skb, struct genl_info *info) } /* Locate meter, copy stats. 
*/ - meter = lookup_meter(dp, meter_id); + meter = lookup_meter(&dp->meter_tbl, meter_id); if (!meter) { err = -ENOENT; goto exit_unlock; @@ -390,18 +513,17 @@ exit_unlock: static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info) { - struct nlattr **a = info->attrs; - u32 meter_id; struct ovs_header *ovs_header = info->userhdr; struct ovs_header *ovs_reply_header; + struct nlattr **a = info->attrs; + struct dp_meter *old_meter; + struct sk_buff *reply; struct datapath *dp; + u32 meter_id; int err; - struct sk_buff *reply; - struct dp_meter *old_meter; if (!a[OVS_METER_ATTR_ID]) return -EINVAL; - meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_DEL, &ovs_reply_header); @@ -416,14 +538,19 @@ static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info) goto exit_unlock; } - old_meter = lookup_meter(dp, meter_id); + meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); + old_meter = lookup_meter(&dp->meter_tbl, meter_id); if (old_meter) { spin_lock_bh(&old_meter->lock); err = ovs_meter_cmd_reply_stats(reply, meter_id, old_meter); WARN_ON(err); spin_unlock_bh(&old_meter->lock); - detach_meter(old_meter); + + err = detach_meter(&dp->meter_tbl, old_meter); + if (err) + goto exit_unlock; } + ovs_unlock(); ovs_meter_free(old_meter); genlmsg_end(reply, ovs_reply_header); @@ -443,16 +570,16 @@ exit_unlock: bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, u32 meter_id) { - struct dp_meter *meter; - struct dp_meter_band *band; long long int now_ms = div_u64(ktime_get_ns(), 1000 * 1000); long long int long_delta_ms; - u32 delta_ms; - u32 cost; + struct dp_meter_band *band; + struct dp_meter *meter; int i, band_exceeded_max = -1; u32 band_exceeded_rate = 0; + u32 delta_ms; + u32 cost; - meter = lookup_meter(dp, meter_id); + meter = lookup_meter(&dp->meter_tbl, meter_id); /* Do not drop the packet when there is no meter. 
*/ if (!meter) return false; @@ -570,32 +697,27 @@ struct genl_family dp_meter_genl_family __ro_after_init = { int ovs_meters_init(struct datapath *dp) { - int i; + struct dp_meter_table *tbl = &dp->meter_tbl; + struct dp_meter_instance *ti; - dp->meters = kmalloc_array(METER_HASH_BUCKETS, - sizeof(struct hlist_head), GFP_KERNEL); - - if (!dp->meters) + ti = dp_meter_instance_alloc(DP_METER_ARRAY_SIZE_MIN); + if (!ti) return -ENOMEM; - for (i = 0; i < METER_HASH_BUCKETS; i++) - INIT_HLIST_HEAD(&dp->meters[i]); + rcu_assign_pointer(tbl->ti, ti); + tbl->count = 0; return 0; } void ovs_meters_exit(struct datapath *dp) { + struct dp_meter_table *tbl = &dp->meter_tbl; + struct dp_meter_instance *ti = rcu_dereference_raw(tbl->ti); int i; - for (i = 0; i < METER_HASH_BUCKETS; i++) { - struct hlist_head *head = &dp->meters[i]; - struct dp_meter *meter; - struct hlist_node *n; - - hlist_for_each_entry_safe(meter, n, head, dp_hash_node) - kfree(meter); - } + for (i = 0; i < ti->n_meters; i++) + ovs_meter_free(ti->dp_meters[i]); - kfree(dp->meters); + dp_meter_instance_free(ti); } diff --git a/net/openvswitch/meter.h b/net/openvswitch/meter.h index f645913870bd..f52052d30a16 100644 --- a/net/openvswitch/meter.h +++ b/net/openvswitch/meter.h @@ -13,11 +13,13 @@ #include #include #include +#include #include "flow.h" struct datapath; #define DP_MAX_BANDS 1 +#define DP_METER_ARRAY_SIZE_MIN BIT_ULL(10) struct dp_meter_band { u32 type; @@ -30,9 +32,6 @@ struct dp_meter { spinlock_t lock; /* Per meter lock */ struct rcu_head rcu; - struct hlist_node dp_hash_node; /*Element in datapath->meters - * hash table. - */ u32 id; u16 kbps:1, keep_stats:1; u16 n_bands; @@ -42,6 +41,17 @@ struct dp_meter { struct dp_meter_band bands[]; }; +struct dp_meter_instance { + struct rcu_head rcu; + u32 n_meters; + struct dp_meter __rcu *dp_meters[]; +}; + +struct dp_meter_table { + struct dp_meter_instance __rcu *ti; + u32 count; +}; + extern struct genl_family dp_meter_genl_family; int ovs_meters_init(struct datapath *dp); void ovs_meters_exit(struct datapath *dp); -- cgit v1.2.3 From eb58eebc7fb5e23c9cc7d557c0a9236630591526 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Fri, 24 Apr 2020 08:08:03 +0800 Subject: net: openvswitch: set max limitation to meters Don't allow users to create meters without limit, as that may consume a large amount of kernel memory. The maximum number supported is derived from the available physical memory, with 20K meters as the default. Cc: Pravin B Shelar Cc: Andy Zhou Signed-off-by: Tonghao Zhang Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/meter.c | 57 ++++++++++++++++++++++++++++++++++--------- net/openvswitch/meter.h | 2 ++ 2 files changed, 49 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index f806ded1dd0a..372f4565872d 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -137,6 +138,7 @@ static int attach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) { struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); u32 hash = meter_hash(ti, meter->id); + int err; /* In generally, slots selected should be empty, because * OvS uses id-pool to fetch a available id.
@@ -147,16 +149,24 @@ static int attach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) dp_meter_instance_insert(ti, meter); /* That function is thread-safe. */ - if (++tbl->count >= ti->n_meters) - if (dp_meter_instance_realloc(tbl, ti->n_meters * 2)) - goto expand_err; + tbl->count++; + if (tbl->count >= tbl->max_meters_allowed) { + err = -EFBIG; + goto attach_err; + } + + if (tbl->count >= ti->n_meters && + dp_meter_instance_realloc(tbl, ti->n_meters * 2)) { + err = -ENOMEM; + goto attach_err; + } return 0; -expand_err: +attach_err: dp_meter_instance_remove(ti, meter); tbl->count--; - return -ENOMEM; + return err; } static int detach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) @@ -266,18 +276,32 @@ static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info *info) { - struct sk_buff *reply; + struct ovs_header *ovs_header = info->userhdr; struct ovs_header *ovs_reply_header; struct nlattr *nla, *band_nla; - int err; + struct sk_buff *reply; + struct datapath *dp; + int err = -EMSGSIZE; reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_FEATURES, &ovs_reply_header); if (IS_ERR(reply)) return PTR_ERR(reply); - if (nla_put_u32(reply, OVS_METER_ATTR_MAX_METERS, U32_MAX) || - nla_put_u32(reply, OVS_METER_ATTR_MAX_BANDS, DP_MAX_BANDS)) + ovs_lock(); + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + err = -ENODEV; + goto exit_unlock; + } + + if (nla_put_u32(reply, OVS_METER_ATTR_MAX_METERS, + dp->meter_tbl.max_meters_allowed)) + goto exit_unlock; + + ovs_unlock(); + + if (nla_put_u32(reply, OVS_METER_ATTR_MAX_BANDS, DP_MAX_BANDS)) goto nla_put_failure; nla = nla_nest_start_noflag(reply, OVS_METER_ATTR_BANDS); @@ -296,9 +320,10 @@ static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info *info) genlmsg_end(reply, ovs_reply_header); return genlmsg_reply(reply, info); +exit_unlock: + ovs_unlock(); nla_put_failure: nlmsg_free(reply); - err = -EMSGSIZE; return err; } @@ -699,15 +724,27 @@ int ovs_meters_init(struct datapath *dp) { struct dp_meter_table *tbl = &dp->meter_tbl; struct dp_meter_instance *ti; + unsigned long free_mem_bytes; ti = dp_meter_instance_alloc(DP_METER_ARRAY_SIZE_MIN); if (!ti) return -ENOMEM; + /* Allow meters in a datapath to use ~3.12% of physical memory. */ + free_mem_bytes = nr_free_buffer_pages() * (PAGE_SIZE >> 5); + tbl->max_meters_allowed = min(free_mem_bytes / sizeof(struct dp_meter), + DP_METER_NUM_MAX); + if (!tbl->max_meters_allowed) + goto out_err; + rcu_assign_pointer(tbl->ti, ti); tbl->count = 0; return 0; + +out_err: + dp_meter_instance_free(ti); + return -ENOMEM; } void ovs_meters_exit(struct datapath *dp) diff --git a/net/openvswitch/meter.h b/net/openvswitch/meter.h index f52052d30a16..61a3ca43cd77 100644 --- a/net/openvswitch/meter.h +++ b/net/openvswitch/meter.h @@ -20,6 +20,7 @@ struct datapath; #define DP_MAX_BANDS 1 #define DP_METER_ARRAY_SIZE_MIN BIT_ULL(10) +#define DP_METER_NUM_MAX (200000UL) struct dp_meter_band { u32 type; @@ -50,6 +51,7 @@ struct dp_meter_instance { struct dp_meter_table { struct dp_meter_instance __rcu *ti; u32 count; + u32 max_meters_allowed; }; extern struct genl_family dp_meter_genl_family; -- cgit v1.2.3 From a8e387384f554ea0b63889a7682ffcddee24df8b Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Fri, 24 Apr 2020 08:08:04 +0800 Subject: net: openvswitch: remove the unnecessary check Before invoking ovs_meter_cmd_reply_stats(), "meter" was already checked, so don't check it again in that function.
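A quick sanity check on the "~3.12%" comment in the meter limit patch above: nr_free_buffer_pages() * (PAGE_SIZE >> 5) budgets PAGE_SIZE / 32 bytes per free buffer page, and 1/32 = 3.125%; with 4 KiB pages that is 128 bytes of budget per page. The resulting cap is min(budget / sizeof(struct dp_meter), DP_METER_NUM_MAX), i.e. at most 200,000 meters per datapath.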
Cc: Pravin B Shelar Cc: Andy Zhou Signed-off-by: Tonghao Zhang Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/meter.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 372f4565872d..b7893b0d6423 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -242,12 +242,11 @@ static int ovs_meter_cmd_reply_stats(struct sk_buff *reply, u32 meter_id, if (nla_put_u32(reply, OVS_METER_ATTR_ID, meter_id)) goto error; - if (!meter) - return 0; - if (nla_put(reply, OVS_METER_ATTR_STATS, - sizeof(struct ovs_flow_stats), &meter->stats) || - nla_put_u64_64bit(reply, OVS_METER_ATTR_USED, meter->used, + sizeof(struct ovs_flow_stats), &meter->stats)) + goto error; + + if (nla_put_u64_64bit(reply, OVS_METER_ATTR_USED, meter->used, OVS_METER_ATTR_PAD)) goto error; -- cgit v1.2.3 From c77350089052cafa8125169e37463ab7028d6a18 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Fri, 24 Apr 2020 08:08:05 +0800 Subject: net: openvswitch: make EINVAL return value more obvious Cc: Pravin B Shelar Cc: Andy Zhou Signed-off-by: Tonghao Zhang Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/meter.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index b7893b0d6423..e36b464b32a5 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -419,9 +419,8 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) u32 meter_id; bool failed; - if (!a[OVS_METER_ATTR_ID]) { - return -ENODEV; - } + if (!a[OVS_METER_ATTR_ID]) + return -EINVAL; meter = dp_meter_create(a); if (IS_ERR_OR_NULL(meter)) -- cgit v1.2.3 From e57358873bb5d6caa882b9684f59140912b37dde Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Fri, 24 Apr 2020 08:08:06 +0800 Subject: net: openvswitch: use u64 for meter bucket When setting the meter rate to 4+Gbps, there is an overflow, the meters don't work as expected. Cc: Pravin B Shelar Cc: Andy Zhou Signed-off-by: Tonghao Zhang Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/meter.c | 2 +- net/openvswitch/meter.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index e36b464b32a5..915f31123f23 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -392,7 +392,7 @@ static struct dp_meter *dp_meter_create(struct nlattr **a) * * Start with a full bucket. */ - band->bucket = (band->burst_size + band->rate) * 1000; + band->bucket = (band->burst_size + band->rate) * 1000ULL; band_max_delta_t = band->bucket / band->rate; if (band_max_delta_t > meter->max_delta_t) meter->max_delta_t = band_max_delta_t; diff --git a/net/openvswitch/meter.h b/net/openvswitch/meter.h index 61a3ca43cd77..0c33889a8515 100644 --- a/net/openvswitch/meter.h +++ b/net/openvswitch/meter.h @@ -26,7 +26,7 @@ struct dp_meter_band { u32 type; u32 rate; u32 burst_size; - u32 bucket; /* 1/1000 packets, or in bits */ + u64 bucket; /* 1/1000 packets, or in bits */ struct ovs_flow_stats stats; }; -- cgit v1.2.3 From 1db364c88695272e3410eb4b5d4595c8cb15db30 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 17 Apr 2020 12:38:04 +0200 Subject: mac80211: mlme: remove duplicate AID bookkeeping Maintain the connection AID only in sdata->vif.bss_conf.aid, not also in sdata->u.mgd.aid. 
Keep setting that where we set ifmgd->aid before, which has the side effect of exposing the AID to the driver before the station entry (AP) is marked associated, in case it needs it then. Requested-by: Felix Fietkau Signed-off-by: Johannes Berg Tested-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/20200417123802.085d4a322b0c.I2e7a2ceceea8c6880219f9e9ee4d4ac985fd295a@changeid Signed-off-by: Johannes Berg --- net/mac80211/debugfs_netdev.c | 2 +- net/mac80211/ieee80211_i.h | 2 -- net/mac80211/mlme.c | 7 +++---- net/mac80211/tdls.c | 3 +-- net/mac80211/tx.c | 2 +- 5 files changed, 6 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 3dbe7c5cefd1..d7e955127d5c 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -236,7 +236,7 @@ IEEE80211_IF_FILE_R(hw_queues); /* STA attributes */ IEEE80211_IF_FILE(bssid, u.mgd.bssid, MAC); -IEEE80211_IF_FILE(aid, u.mgd.aid, DEC); +IEEE80211_IF_FILE(aid, vif.bss_conf.aid, DEC); IEEE80211_IF_FILE(beacon_timeout, u.mgd.beacon_timeout, JIFFIES_TO_MS); static int ieee80211_set_smps(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index f8ed4f621f7f..934a91bef575 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -450,8 +450,6 @@ struct ieee80211_if_managed { u8 bssid[ETH_ALEN] __aligned(2); - u16 aid; - bool powersave; /* powersave requested for this iface */ bool broken_ap; /* AP is broken -- turn off powersave */ bool have_beacon; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 16d75da0996a..7139335f29c0 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3249,7 +3249,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, return false; } - ifmgd->aid = aid; + sdata->vif.bss_conf.aid = aid; ifmgd->tdls_chan_switch_prohibited = elems->ext_capab && elems->ext_capab_len >= 5 && (elems->ext_capab[4] & WLAN_EXT_CAPA5_TDLS_CH_SW_PROHIBITED); @@ -3521,9 +3521,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, bss_conf->protected_keep_alive = false; } - /* set AID and assoc capability, + /* set assoc capability (AID was already set earlier), * ieee80211_set_associated() will tell the driver */ - bss_conf->aid = aid; bss_conf->assoc_capability = capab_info; ieee80211_set_associated(sdata, cbss, changed); @@ -3948,7 +3947,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, mgmt->bssid, bssid); if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && - ieee80211_check_tim(elems.tim, elems.tim_len, ifmgd->aid)) { + ieee80211_check_tim(elems.tim, elems.tim_len, bss_conf->aid)) { if (local->hw.conf.dynamic_ps_timeout > 0) { if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index fca1f5477396..7ff22f9d6e80 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -226,12 +226,11 @@ static void ieee80211_tdls_add_link_ie(struct ieee80211_sub_if_data *sdata, static void ieee80211_tdls_add_aid(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { - struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u8 *pos = skb_put(skb, 4); *pos++ = WLAN_EID_AID; *pos++ = 2; /* len */ - put_unaligned_le16(ifmgd->aid, pos); + put_unaligned_le16(sdata->vif.bss_conf.aid, pos); } /* translate numbering in the WMM parameter IE to the mac80211 notation */ diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c 
index 82846aca86d9..3dc1990e15c5 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -5006,7 +5006,7 @@ struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw, pspoll = skb_put_zero(skb, sizeof(*pspoll)); pspoll->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_PSPOLL); - pspoll->aid = cpu_to_le16(ifmgd->aid); + pspoll->aid = cpu_to_le16(sdata->vif.bss_conf.aid); /* aid in PS-Poll has its two MSBs each set to 1 */ pspoll->aid |= cpu_to_le16(1 << 15 | 1 << 14); -- cgit v1.2.3 From 90e8f58dfc04d1bd48ca155cc55ebf7ba1824864 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 17 Apr 2020 11:18:31 +0200 Subject: mac80211: fix drv_config_iface_filter() behaviour There are two bugs with this: first, it shouldn't be called on an interface that's down; and second, it should then be called when the interface comes up. Note that the currently only user (iwlwifi) doesn't seem to care about either of these scenarios. Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200417111830.401d82c7a0bf.I5dc7d718816460c2d8d89c7af6c215f9e2b3078f@changeid Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 15 +++++++++------ net/mac80211/iface.c | 5 +++++ 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 0f72813fed53..b90f2131ec7a 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3421,12 +3421,15 @@ static void ieee80211_mgmt_frame_register(struct wiphy *wiphy, if (!local->open_count) break; - if (sdata->vif.probe_req_reg == 1) - drv_config_iface_filter(local, sdata, FIF_PROBE_REQ, - FIF_PROBE_REQ); - else if (sdata->vif.probe_req_reg == 0) - drv_config_iface_filter(local, sdata, 0, - FIF_PROBE_REQ); + if (ieee80211_sdata_running(sdata)) { + if (sdata->vif.probe_req_reg == 1) + drv_config_iface_filter(local, sdata, + FIF_PROBE_REQ, + FIF_PROBE_REQ); + else if (sdata->vif.probe_req_reg == 0) + drv_config_iface_filter(local, sdata, 0, + FIF_PROBE_REQ); + } ieee80211_configure_filter(local); break; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index d069825705d6..f900c84fb40f 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -644,6 +644,11 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) local->fif_probe_req++; } + if (sdata->vif.probe_req_reg) + drv_config_iface_filter(local, sdata, + FIF_PROBE_REQ, + FIF_PROBE_REQ); + if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && sdata->vif.type != NL80211_IFTYPE_NAN) changed |= ieee80211_reset_erp_info(sdata); -- cgit v1.2.3 From 4d797fce783a8eb11dd23463828db84743795046 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Wed, 1 Apr 2020 17:25:47 +0300 Subject: cfg80211: Unprotected Beacon frame RX indication Extend cfg80211_rx_unprot_mlme_mgmt() to cover indication of unprotected Beacon frames in addition to the previously used Deauthentication and Disassociation frames. The Beacon frame case is quite similar, but has a couple of exceptions: this is used both with fully unprotected and also incorrectly protected frames, and there is a rate limit on the events to avoid unnecessarily flooding netlink with events in case something goes wrong.
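The rate limit mentioned above is a plain jiffies-timestamp check, visible in the nl80211 hunk below; reduced to a sketch, the pattern is:

	/* report at most one unprotected-beacon event per 10 s per wdev */
	if (wdev->unprot_beacon_reported &&
	    elapsed_jiffies_msecs(wdev->unprot_beacon_reported) < 10000)
		return;
	...
	wdev->unprot_beacon_reported = jiffies;

elapsed_jiffies_msecs() accounts for jiffies wraparound, so the comparison stays correct across a counter rollover.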
Signed-off-by: Jouni Malinen Link: https://lore.kernel.org/r/20200401142548.6990-1-jouni@codeaurora.org [add missing kernel-doc] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 12 ++++++++++-- include/uapi/linux/nl80211.h | 7 +++++++ net/wireless/nl80211.c | 13 +++++++++++-- net/wireless/sme.c | 2 ++ 4 files changed, 30 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 70e48f66dac8..775952677b3d 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5045,6 +5045,8 @@ struct cfg80211_cqm_config; * @pmsr_list: (private) peer measurement requests * @pmsr_lock: (private) peer measurements requests/results lock * @pmsr_free_wk: (private) peer measurements cleanup work + * @unprot_beacon_reported: (private) timestamp of last + * unprotected beacon report */ struct wireless_dev { struct wiphy *wiphy; @@ -5121,6 +5123,8 @@ struct wireless_dev { struct list_head pmsr_list; spinlock_t pmsr_lock; struct work_struct pmsr_free_wk; + + unsigned long unprot_beacon_reported; }; static inline u8 *wdev_address(struct wireless_dev *wdev) @@ -6135,12 +6139,16 @@ void cfg80211_tx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len); /** * cfg80211_rx_unprot_mlme_mgmt - notification of unprotected mlme mgmt frame * @dev: network device - * @buf: deauthentication frame (header + body) + * @buf: received management frame (header + body) * @len: length of the frame data * * This function is called whenever a received deauthentication or dissassoc * frame has been dropped in station mode because of MFP being used but the - * frame was not protected. This function may sleep. + * frame was not protected. This is also used to notify reception of a Beacon + * frame that was dropped because it did not include a valid MME MIC while + * beacon protection was enabled (BIGTK configured in station mode). + * + * This function may sleep. */ void cfg80211_rx_unprot_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len); diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 2b691161830f..afdd9802ccb8 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1151,6 +1151,11 @@ * @NL80211_CMD_SET_TID_CONFIG: Data frame TID specific configuration * is passed using %NL80211_ATTR_TID_CONFIG attribute. * + * @NL80211_CMD_UNPROT_BEACON: Unprotected or incorrectly protected Beacon + * frame. This event is used to indicate that a received Beacon frame was + * dropped because it did not include a valid MME MIC while beacon + * protection was enabled (BIGTK configured in station mode). 
+ * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1377,6 +1382,8 @@ enum nl80211_commands { NL80211_CMD_SET_TID_CONFIG, + NL80211_CMD_UNPROT_BEACON, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 692bcd35f809..2127e5344b1a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -15542,10 +15542,19 @@ void cfg80211_rx_unprot_mlme_mgmt(struct net_device *dev, const u8 *buf, if (WARN_ON(len < 2)) return; - if (ieee80211_is_deauth(mgmt->frame_control)) + if (ieee80211_is_deauth(mgmt->frame_control)) { cmd = NL80211_CMD_UNPROT_DEAUTHENTICATE; - else + } else if (ieee80211_is_disassoc(mgmt->frame_control)) { cmd = NL80211_CMD_UNPROT_DISASSOCIATE; + } else if (ieee80211_is_beacon(mgmt->frame_control)) { + if (wdev->unprot_beacon_reported && + elapsed_jiffies_msecs(wdev->unprot_beacon_reported) < 10000) + return; + cmd = NL80211_CMD_UNPROT_BEACON; + wdev->unprot_beacon_reported = jiffies; + } else { + return; + } trace_cfg80211_rx_unprot_mlme_mgmt(dev, buf, len); nl80211_send_mlme_event(rdev, dev, buf, len, cmd, GFP_ATOMIC, -1, diff --git a/net/wireless/sme.c b/net/wireless/sme.c index ac3e60aa1fc8..3554c0d951f4 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -694,6 +694,7 @@ void __cfg80211_connect_result(struct net_device *dev, return; } + wdev->unprot_beacon_reported = 0; nl80211_send_connect_result(wiphy_to_rdev(wdev->wiphy), dev, cr, GFP_KERNEL); @@ -921,6 +922,7 @@ void __cfg80211_roamed(struct wireless_dev *wdev, cfg80211_hold_bss(bss_from_pub(info->bss)); wdev->current_bss = bss_from_pub(info->bss); + wdev->unprot_beacon_reported = 0; nl80211_send_roamed(wiphy_to_rdev(wdev->wiphy), wdev->netdev, info, GFP_KERNEL); -- cgit v1.2.3 From 9eaf183af741e3d8393eb571ac8aec9ee7d6530e Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Wed, 1 Apr 2020 17:25:48 +0300 Subject: mac80211: Report beacon protection failures to user space Report received Beacon frames that do not have a valid MME MIC when beacon protection is enabled. This covers both the cases of no MME in the received frame and invalid MIC in the MME. 
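In condensed form, the change amounts to the following hypothetical helper (not the actual mac80211 code; the real call sites are visible in the diff below):

/* hypothetical condensation: on any beacon-protection failure, i.e. no
 * MME present, an unexpected BIP key index, or a MIC failure, the frame
 * is now reported to user space before being dropped */
static void report_unprot_beacon(struct ieee80211_rx_data *rx)
{
	cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev,
				     rx->skb->data, rx->skb->len);
	/* the frame is still dropped afterwards, exactly as before */
}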
Signed-off-by: Jouni Malinen Link: https://lore.kernel.org/r/20200401142548.6990-2-jouni@codeaurora.org Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 91a13aee4378..a724551b8ddf 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1984,8 +1984,12 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) if (mmie_keyidx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS || mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS + - NUM_DEFAULT_BEACON_KEYS) + NUM_DEFAULT_BEACON_KEYS) { + cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev, + skb->data, + skb->len); return RX_DROP_MONITOR; /* unexpected BIP keyidx */ + } rx->key = ieee80211_rx_get_bigtk(rx, mmie_keyidx); if (!rx->key) @@ -2131,6 +2135,10 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) /* either the frame has been decrypted or will be dropped */ status->flag |= RX_FLAG_DECRYPTED; + if (unlikely(ieee80211_is_beacon(fc) && result == RX_DROP_UNUSABLE)) + cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev, + skb->data, skb->len); + return result; } @@ -2411,8 +2419,12 @@ static int ieee80211_drop_unencrypted_mgmt(struct ieee80211_rx_data *rx) return -EACCES; } if (unlikely(ieee80211_is_beacon(fc) && rx->key && - ieee80211_get_mmie_keyidx(rx->skb) < 0)) + ieee80211_get_mmie_keyidx(rx->skb) < 0)) { + cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev, + rx->skb->data, + rx->skb->len); return -EACCES; + } /* * When using MFP, Action frames are not allowed prior to * having configured keys. -- cgit v1.2.3 From 6cd536fe62ef58d7c4eac2da07ab0ed7fd19010d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 17 Apr 2020 12:43:01 +0200 Subject: cfg80211: change internal management frame registration API Almost all drivers below cfg80211 get the API wrong (except for mac80211) and are unable to cope with multiple registrations for the same frame type, which is valid due to the match filter. This seems to indicate the API is wrong, and we should maintain the full information in cfg80211 instead of the drivers. Change the API to no longer inform the driver about individual registrations and unregistrations, but rather every time about the entire state of the entire wiphy and single wdev, whenever it may have changed. This also simplifies the code in cfg80211 as it no longer has to track exactly what was unregistered and can free things immediately.
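The two bitmaps in struct mgmt_frame_regs are cheap to derive from the registration lists; a stand-alone sketch (the BIT(frame_type >> 4) math matches the patch, the registration list is invented for illustration):

#include <stdio.h>

struct reg_entry { unsigned frame_type; int wdev_id; };

int main(void)
{
	const struct reg_entry regs[] = {
		{ 0x0040, 0 },	/* Probe Request registered on wdev 0 */
		{ 0x00d0, 1 },	/* Action frame registered on wdev 1 */
	};
	unsigned global_stypes = 0, interface_stypes = 0; /* for wdev 1 */
	unsigned i;

	for (i = 0; i < sizeof(regs) / sizeof(regs[0]); i++) {
		unsigned mask = 1u << (regs[i].frame_type >> 4);

		global_stypes |= mask;		  /* whole wiphy */
		if (regs[i].wdev_id == 1)
			interface_stypes |= mask; /* just this wdev */
	}

	/* prints: global 0x2010, interface 0x2000 */
	printf("global 0x%04x, interface 0x%04x\n",
	       global_stypes, interface_stypes);
	return 0;
}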
Signed-off-by: Johannes Berg Acked-by: Arend van Spriel Reviewed-by: Sergey Matyukevich Link: https://lore.kernel.org/r/20200417124300.f47f3828afc8.I7f81ef59c2c5a340d7075fb3c6d0e08e8aeffe07@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath6kl/cfg80211.c | 26 ++++--- .../broadcom/brcm80211/brcmfmac/cfg80211.c | 19 ++--- drivers/net/wireless/marvell/mwifiex/cfg80211.c | 16 ++--- drivers/net/wireless/quantenna/qtnfmac/cfg80211.c | 83 +++++++++++----------- include/net/cfg80211.h | 23 ++++-- include/net/mac80211.h | 2 +- net/mac80211/cfg.c | 50 ++++++------- net/mac80211/ieee80211_i.h | 2 +- net/wireless/core.c | 7 +- net/wireless/core.h | 6 +- net/wireless/mlme.c | 72 ++++++++----------- net/wireless/rdev-ops.h | 11 +-- net/wireless/trace.h | 20 +++--- 13 files changed, 159 insertions(+), 178 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c index 37cf602d8adf..67f8f2aa7a53 100644 --- a/drivers/net/wireless/ath/ath6kl/cfg80211.c +++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c @@ -3249,22 +3249,19 @@ static int ath6kl_get_antenna(struct wiphy *wiphy, return 0; } -static void ath6kl_mgmt_frame_register(struct wiphy *wiphy, - struct wireless_dev *wdev, - u16 frame_type, bool reg) +static void ath6kl_update_mgmt_frame_registrations(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct mgmt_frame_regs *upd) { struct ath6kl_vif *vif = ath6kl_vif_from_wdev(wdev); - ath6kl_dbg(ATH6KL_DBG_WLAN_CFG, "%s: frame_type=0x%x reg=%d\n", - __func__, frame_type, reg); - if (frame_type == IEEE80211_STYPE_PROBE_REQ) { - /* - * Note: This notification callback is not allowed to sleep, so - * we cannot send WMI_PROBE_REQ_REPORT_CMD here. Instead, we - * hardcode target to report Probe Request frames all the time. - */ - vif->probe_req_report = reg; - } + /* + * FIXME: send WMI_PROBE_REQ_REPORT_CMD here instead of hardcoding + * the reporting in the target all the time, this callback + * *is* allowed to sleep after all. 
+ */ + vif->probe_req_report = + upd->interface_stypes & BIT(IEEE80211_STYPE_PROBE_REQ >> 4); } static int ath6kl_cfg80211_sscan_start(struct wiphy *wiphy, @@ -3464,7 +3461,8 @@ static struct cfg80211_ops ath6kl_cfg80211_ops = { .remain_on_channel = ath6kl_remain_on_channel, .cancel_remain_on_channel = ath6kl_cancel_remain_on_channel, .mgmt_tx = ath6kl_mgmt_tx, - .mgmt_frame_register = ath6kl_mgmt_frame_register, + .update_mgmt_frame_registrations = + ath6kl_update_mgmt_frame_registrations, .get_antenna = ath6kl_get_antenna, .sched_scan_start = ath6kl_cfg80211_sscan_start, .sched_scan_stop = ath6kl_cfg80211_sscan_stop, diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index 2ba165330038..fa846471dac2 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -4979,21 +4979,15 @@ brcmf_cfg80211_change_station(struct wiphy *wiphy, struct net_device *ndev, } static void -brcmf_cfg80211_mgmt_frame_register(struct wiphy *wiphy, - struct wireless_dev *wdev, - u16 frame_type, bool reg) +brcmf_cfg80211_update_mgmt_frame_registrations(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct mgmt_frame_regs *upd) { struct brcmf_cfg80211_vif *vif; - u16 mgmt_type; - brcmf_dbg(TRACE, "Enter, frame_type %04x, reg=%d\n", frame_type, reg); - - mgmt_type = (frame_type & IEEE80211_FCTL_STYPE) >> 4; vif = container_of(wdev, struct brcmf_cfg80211_vif, wdev); - if (reg) - vif->mgmt_rx_reg |= BIT(mgmt_type); - else - vif->mgmt_rx_reg &= ~BIT(mgmt_type); + + vif->mgmt_rx_reg = upd->interface_stypes; } @@ -5408,7 +5402,8 @@ static struct cfg80211_ops brcmf_cfg80211_ops = { .change_station = brcmf_cfg80211_change_station, .sched_scan_start = brcmf_cfg80211_sched_scan_start, .sched_scan_stop = brcmf_cfg80211_sched_scan_stop, - .mgmt_frame_register = brcmf_cfg80211_mgmt_frame_register, + .update_mgmt_frame_registrations = + brcmf_cfg80211_update_mgmt_frame_registrations, .mgmt_tx = brcmf_cfg80211_mgmt_tx, .remain_on_channel = brcmf_p2p_remain_on_channel, .cancel_remain_on_channel = brcmf_cfg80211_cancel_remain_on_channel, diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c index 1566d2197906..21a17d4017c4 100644 --- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c +++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c @@ -269,17 +269,12 @@ mwifiex_cfg80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, * CFG802.11 operation handler to register a mgmt frame. 
*/ static void -mwifiex_cfg80211_mgmt_frame_register(struct wiphy *wiphy, - struct wireless_dev *wdev, - u16 frame_type, bool reg) +mwifiex_cfg80211_update_mgmt_frame_registrations(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct mgmt_frame_regs *upd) { struct mwifiex_private *priv = mwifiex_netdev_get_priv(wdev->netdev); - u32 mask; - - if (reg) - mask = priv->mgmt_frame_mask | BIT(frame_type >> 4); - else - mask = priv->mgmt_frame_mask & ~BIT(frame_type >> 4); + u32 mask = upd->interface_stypes; if (mask != priv->mgmt_frame_mask) { priv->mgmt_frame_mask = mask; @@ -4189,7 +4184,8 @@ static struct cfg80211_ops mwifiex_cfg80211_ops = { .del_key = mwifiex_cfg80211_del_key, .set_default_mgmt_key = mwifiex_cfg80211_set_default_mgmt_key, .mgmt_tx = mwifiex_cfg80211_mgmt_tx, - .mgmt_frame_register = mwifiex_cfg80211_mgmt_frame_register, + .update_mgmt_frame_registrations = + mwifiex_cfg80211_update_mgmt_frame_registrations, .remain_on_channel = mwifiex_cfg80211_remain_on_channel, .cancel_remain_on_channel = mwifiex_cfg80211_cancel_remain_on_channel, .set_default_key = mwifiex_cfg80211_set_default_key, diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c index 8be17106008d..54cdf3ad09d7 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c +++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c @@ -389,55 +389,57 @@ static int qtnf_set_wiphy_params(struct wiphy *wiphy, u32 changed) } static void -qtnf_mgmt_frame_register(struct wiphy *wiphy, struct wireless_dev *wdev, - u16 frame_type, bool reg) +qtnf_update_mgmt_frame_registrations(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct mgmt_frame_regs *upd) { struct qtnf_vif *vif = qtnf_netdev_get_priv(wdev->netdev); - u16 mgmt_type; - u16 new_mask; - u16 qlink_frame_type = 0; + u16 new_mask = upd->interface_stypes; + u16 old_mask = vif->mgmt_frames_bitmask; + static const struct { + u16 mask, qlink_type; + } updates[] = { + { + .mask = BIT(IEEE80211_STYPE_REASSOC_REQ >> 4) | + BIT(IEEE80211_STYPE_ASSOC_REQ >> 4), + .qlink_type = QLINK_MGMT_FRAME_ASSOC_REQ, + }, + { + .mask = BIT(IEEE80211_STYPE_AUTH >> 4), + .qlink_type = QLINK_MGMT_FRAME_AUTH, + }, + { + .mask = BIT(IEEE80211_STYPE_PROBE_REQ >> 4), + .qlink_type = QLINK_MGMT_FRAME_PROBE_REQ, + }, + { + .mask = BIT(IEEE80211_STYPE_ACTION >> 4), + .qlink_type = QLINK_MGMT_FRAME_ACTION, + }, + }; + unsigned int i; - mgmt_type = (frame_type & IEEE80211_FCTL_STYPE) >> 4; + if (new_mask == old_mask) + return; - if (reg) - new_mask = vif->mgmt_frames_bitmask | BIT(mgmt_type); - else - new_mask = vif->mgmt_frames_bitmask & ~BIT(mgmt_type); + for (i = 0; i < ARRAY_SIZE(updates); i++) { + u16 mask = updates[i].mask; + u16 qlink_frame_type = updates[i].qlink_type; + bool reg; - if (new_mask == vif->mgmt_frames_bitmask) - return; + /* the ! 
are here due to the assoc/reassoc merge */ + if (!(new_mask & mask) == !(old_mask & mask)) + continue; - switch (frame_type & IEEE80211_FCTL_STYPE) { - case IEEE80211_STYPE_REASSOC_REQ: - case IEEE80211_STYPE_ASSOC_REQ: - qlink_frame_type = QLINK_MGMT_FRAME_ASSOC_REQ; - break; - case IEEE80211_STYPE_AUTH: - qlink_frame_type = QLINK_MGMT_FRAME_AUTH; - break; - case IEEE80211_STYPE_PROBE_REQ: - qlink_frame_type = QLINK_MGMT_FRAME_PROBE_REQ; - break; - case IEEE80211_STYPE_ACTION: - qlink_frame_type = QLINK_MGMT_FRAME_ACTION; - break; - default: - pr_warn("VIF%u.%u: unsupported frame type: %X\n", - vif->mac->macid, vif->vifid, - (frame_type & IEEE80211_FCTL_STYPE) >> 4); - return; - } + reg = new_mask & mask; - if (qtnf_cmd_send_register_mgmt(vif, qlink_frame_type, reg)) { - pr_warn("VIF%u.%u: failed to %sregister mgmt frame type 0x%x\n", - vif->mac->macid, vif->vifid, reg ? "" : "un", - frame_type); - return; + if (qtnf_cmd_send_register_mgmt(vif, qlink_frame_type, reg)) + pr_warn("VIF%u.%u: failed to %sregister qlink frame type 0x%x\n", + vif->mac->macid, vif->vifid, reg ? "" : "un", + qlink_frame_type); } vif->mgmt_frames_bitmask = new_mask; - pr_debug("VIF%u.%u: %sregistered mgmt frame type 0x%x\n", - vif->mac->macid, vif->vifid, reg ? "" : "un", frame_type); } static int @@ -1017,7 +1019,8 @@ static struct cfg80211_ops qtn_cfg80211_ops = { .change_beacon = qtnf_change_beacon, .stop_ap = qtnf_stop_ap, .set_wiphy_params = qtnf_set_wiphy_params, - .mgmt_frame_register = qtnf_mgmt_frame_register, + .update_mgmt_frame_registrations = + qtnf_update_mgmt_frame_registrations, .mgmt_tx = qtnf_mgmt_tx, .change_station = qtnf_change_station, .del_station = qtnf_del_station, diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 775952677b3d..bc273f6d60f2 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3384,6 +3384,17 @@ struct cfg80211_update_owe_info { size_t ie_len; }; +/** + * struct mgmt_frame_regs - management frame registrations data + * @global_stypes: bitmap of management frame subtypes registered + * for the entire device + * @interface_stypes: bitmap of management frame subtypes registered + * for the given interface + */ +struct mgmt_frame_regs { + u32 global_stypes, interface_stypes; +}; + /** * struct cfg80211_ops - backend description for wireless configuration * @@ -3608,8 +3619,8 @@ struct cfg80211_update_owe_info { * The driver should not call cfg80211_sched_scan_stopped() for a requested * stop (when this method returns 0). * - * @mgmt_frame_register: Notify driver that a management frame type was - * registered. The callback is allowed to sleep. + * @update_mgmt_frame_registrations: Notify the driver that management frame + * registrations were updated. The callback is allowed to sleep. * * @set_antenna: Set antenna configuration (tx_ant, rx_ant) on the device. * Parameters are bitmaps of allowed antennas to use for TX/RX. 
Drivers may @@ -3932,9 +3943,9 @@ struct cfg80211_ops { struct net_device *dev, u32 rate, u32 pkts, u32 intvl); - void (*mgmt_frame_register)(struct wiphy *wiphy, - struct wireless_dev *wdev, - u16 frame_type, bool reg); + void (*update_mgmt_frame_registrations)(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct mgmt_frame_regs *upd); int (*set_antenna)(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant); int (*get_antenna)(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant); @@ -5015,6 +5026,7 @@ struct cfg80211_cqm_config; * by cfg80211 on change_interface * @mgmt_registrations: list of registrations for management frames * @mgmt_registrations_lock: lock for the list + * @mgmt_registrations_update_wk: update work to defer from atomic context * @mtx: mutex used to lock data in this struct, may be used by drivers * and some API functions require it held * @beacon_interval: beacon interval used on this device for transmitting @@ -5060,6 +5072,7 @@ struct wireless_dev { struct list_head mgmt_registrations; spinlock_t mgmt_registrations_lock; + struct work_struct mgmt_registrations_update_wk; struct mutex mtx; diff --git a/include/net/mac80211.h b/include/net/mac80211.h index b6b4de0e4b5e..f6dc5a38720f 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1647,7 +1647,7 @@ struct ieee80211_vif { struct dentry *debugfs_dir; #endif - unsigned int probe_req_reg; + bool probe_req_reg; bool txqs_stopped[IEEE80211_NUM_ACS]; diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index b90f2131ec7a..e62b4764e82e 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3398,44 +3398,35 @@ int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, return 0; } -static void ieee80211_mgmt_frame_register(struct wiphy *wiphy, +static void +ieee80211_update_mgmt_frame_registrations(struct wiphy *wiphy, struct wireless_dev *wdev, - u16 frame_type, bool reg) + struct mgmt_frame_regs *upd) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + u32 preq_mask = BIT(IEEE80211_STYPE_PROBE_REQ >> 4); + bool global_change, intf_change; - switch (frame_type) { - case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_REQ: - if (reg) { - local->probe_req_reg++; - sdata->vif.probe_req_reg++; - } else { - if (local->probe_req_reg) - local->probe_req_reg--; + global_change = + local->probe_req_reg != !!(upd->global_stypes & preq_mask); + local->probe_req_reg = upd->global_stypes & preq_mask; - if (sdata->vif.probe_req_reg) - sdata->vif.probe_req_reg--; - } + intf_change = sdata->vif.probe_req_reg != + !!(upd->interface_stypes & preq_mask); + sdata->vif.probe_req_reg = upd->interface_stypes & preq_mask; - if (!local->open_count) - break; + if (!local->open_count) + return; - if (ieee80211_sdata_running(sdata)) { - if (sdata->vif.probe_req_reg == 1) - drv_config_iface_filter(local, sdata, - FIF_PROBE_REQ, - FIF_PROBE_REQ); - else if (sdata->vif.probe_req_reg == 0) - drv_config_iface_filter(local, sdata, 0, - FIF_PROBE_REQ); - } + if (intf_change && ieee80211_sdata_running(sdata)) + drv_config_iface_filter(local, sdata, + sdata->vif.probe_req_reg ? 
+ FIF_PROBE_REQ : 0, + FIF_PROBE_REQ); + if (global_change) ieee80211_configure_filter(local); - break; - default: - break; - } } static int ieee80211_set_antenna(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant) @@ -4020,7 +4011,8 @@ const struct cfg80211_ops mac80211_config_ops = { .mgmt_tx_cancel_wait = ieee80211_mgmt_tx_cancel_wait, .set_cqm_rssi_config = ieee80211_set_cqm_rssi_config, .set_cqm_rssi_range_config = ieee80211_set_cqm_rssi_range_config, - .mgmt_frame_register = ieee80211_mgmt_frame_register, + .update_mgmt_frame_registrations = + ieee80211_update_mgmt_frame_registrations, .set_antenna = ieee80211_set_antenna, .get_antenna = ieee80211_get_antenna, .set_rekey_data = ieee80211_set_rekey_data, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 934a91bef575..da41ee996d3d 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1167,7 +1167,7 @@ struct ieee80211_local { /* number of interfaces with corresponding FIF_ flags */ int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll, fif_probe_req; - int probe_req_reg; + bool probe_req_reg; unsigned int filter_flags; /* FIF_* */ bool wiphy_ciphers_allocated; diff --git a/net/wireless/core.c b/net/wireless/core.c index 341402b4f178..5757dea2aa94 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -480,9 +480,6 @@ use_default_name: INIT_LIST_HEAD(&rdev->bss_list); INIT_LIST_HEAD(&rdev->sched_scan_req_list); INIT_WORK(&rdev->scan_done_wk, __cfg80211_scan_done); - INIT_LIST_HEAD(&rdev->mlme_unreg); - spin_lock_init(&rdev->mlme_unreg_lock); - INIT_WORK(&rdev->mlme_unreg_wk, cfg80211_mlme_unreg_wk); INIT_DELAYED_WORK(&rdev->dfs_update_channels_wk, cfg80211_dfs_channels_update_work); #ifdef CONFIG_CFG80211_WEXT @@ -1030,7 +1027,6 @@ void wiphy_unregister(struct wiphy *wiphy) cancel_delayed_work_sync(&rdev->dfs_update_channels_wk); flush_work(&rdev->destroy_work); flush_work(&rdev->sched_scan_stop_wk); - flush_work(&rdev->mlme_unreg_wk); flush_work(&rdev->propagate_radar_detect_wk); flush_work(&rdev->propagate_cac_done_wk); @@ -1094,6 +1090,7 @@ static void __cfg80211_unregister_wdev(struct wireless_dev *wdev, bool sync) rdev->devlist_generation++; cfg80211_mlme_purge_registrations(wdev); + flush_work(&wdev->mgmt_registrations_update_wk); switch (wdev->iftype) { case NL80211_IFTYPE_P2P_DEVICE: @@ -1238,6 +1235,8 @@ void cfg80211_init_wdev(struct cfg80211_registered_device *rdev, spin_lock_init(&wdev->event_lock); INIT_LIST_HEAD(&wdev->mgmt_registrations); spin_lock_init(&wdev->mgmt_registrations_lock); + INIT_WORK(&wdev->mgmt_registrations_update_wk, + cfg80211_mgmt_registrations_update_wk); INIT_LIST_HEAD(&wdev->pmsr_list); spin_lock_init(&wdev->pmsr_lock); INIT_WORK(&wdev->pmsr_free_wk, cfg80211_pmsr_free_wk); diff --git a/net/wireless/core.h b/net/wireless/core.h index bb897a803ffe..30fb2c35ae43 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -60,10 +60,6 @@ struct cfg80211_registered_device { struct list_head beacon_registrations; spinlock_t beacon_registrations_lock; - struct list_head mlme_unreg; - spinlock_t mlme_unreg_lock; - struct work_struct mlme_unreg_wk; - /* protected by RTNL only */ int num_running_ifaces; int num_running_monitor_ifaces; @@ -386,7 +382,7 @@ void cfg80211_mlme_down(struct cfg80211_registered_device *rdev, int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid, u16 frame_type, const u8 *match_data, int match_len, struct netlink_ext_ack *extack); -void cfg80211_mlme_unreg_wk(struct work_struct *wk); +void 
cfg80211_mgmt_registrations_update_wk(struct work_struct *wk); void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid); void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev); int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index e4805a3bd310..2e1a21e90b83 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -429,43 +429,37 @@ struct cfg80211_mgmt_registration { u8 match[]; }; -static void -cfg80211_process_mlme_unregistrations(struct cfg80211_registered_device *rdev) +static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev) { + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + struct wireless_dev *tmp; struct cfg80211_mgmt_registration *reg; + struct mgmt_frame_regs upd = {}; ASSERT_RTNL(); - spin_lock_bh(&rdev->mlme_unreg_lock); - while ((reg = list_first_entry_or_null(&rdev->mlme_unreg, - struct cfg80211_mgmt_registration, - list))) { - list_del(®->list); - spin_unlock_bh(&rdev->mlme_unreg_lock); - - if (rdev->ops->mgmt_frame_register) { - u16 frame_type = le16_to_cpu(reg->frame_type); + rcu_read_lock(); + list_for_each_entry_rcu(tmp, &rdev->wiphy.wdev_list, list) { + list_for_each_entry_rcu(reg, &tmp->mgmt_registrations, list) { + u32 mask = BIT(le16_to_cpu(reg->frame_type) >> 4); - rdev_mgmt_frame_register(rdev, reg->wdev, - frame_type, false); + upd.global_stypes |= mask; + if (tmp == wdev) + upd.interface_stypes |= mask; } - - kfree(reg); - - spin_lock_bh(&rdev->mlme_unreg_lock); } - spin_unlock_bh(&rdev->mlme_unreg_lock); + rcu_read_unlock(); + + rdev_update_mgmt_frame_registrations(rdev, wdev, &upd); } -void cfg80211_mlme_unreg_wk(struct work_struct *wk) +void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk) { - struct cfg80211_registered_device *rdev; - - rdev = container_of(wk, struct cfg80211_registered_device, - mlme_unreg_wk); + struct wireless_dev *wdev = container_of(wk, struct wireless_dev, + mgmt_registrations_update_wk); rtnl_lock(); - cfg80211_process_mlme_unregistrations(rdev); + cfg80211_mgmt_registrations_update(wdev); rtnl_unlock(); } @@ -473,8 +467,6 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, u16 frame_type, const u8 *match_data, int match_len, struct netlink_ext_ack *extack) { - struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_mgmt_registration *reg, *nreg; int err = 0; u16 mgmt_type; @@ -534,10 +526,8 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, } } - if (err) { - kfree(nreg); + if (err) goto out; - } memcpy(nreg->match, match_data, match_len); nreg->match_len = match_len; @@ -547,15 +537,12 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, list_add(&nreg->list, &wdev->mgmt_registrations); spin_unlock_bh(&wdev->mgmt_registrations_lock); - /* process all unregistrations to avoid driver confusion */ - cfg80211_process_mlme_unregistrations(rdev); - - if (rdev->ops->mgmt_frame_register) - rdev_mgmt_frame_register(rdev, wdev, frame_type, true); + cfg80211_mgmt_registrations_update(wdev); return 0; out: + kfree(nreg); spin_unlock_bh(&wdev->mgmt_registrations_lock); return err; @@ -574,11 +561,9 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) continue; list_del(®->list); - spin_lock(&rdev->mlme_unreg_lock); - list_add_tail(®->list, &rdev->mlme_unreg); - spin_unlock(&rdev->mlme_unreg_lock); + kfree(reg); - 
schedule_work(&rdev->mlme_unreg_wk); + schedule_work(&wdev->mgmt_registrations_update_wk); } spin_unlock_bh(&wdev->mgmt_registrations_lock); @@ -594,15 +579,16 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev) { - struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + struct cfg80211_mgmt_registration *reg, *tmp; spin_lock_bh(&wdev->mgmt_registrations_lock); - spin_lock(&rdev->mlme_unreg_lock); - list_splice_tail_init(&wdev->mgmt_registrations, &rdev->mlme_unreg); - spin_unlock(&rdev->mlme_unreg_lock); + list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) { + list_del(®->list); + kfree(reg); + } spin_unlock_bh(&wdev->mgmt_registrations_lock); - cfg80211_process_mlme_unregistrations(rdev); + cfg80211_mgmt_registrations_update(wdev); } int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 99462f0c4e08..df5142e86c4f 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -819,13 +819,16 @@ rdev_set_cqm_txe_config(struct cfg80211_registered_device *rdev, } static inline void -rdev_mgmt_frame_register(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev, u16 frame_type, bool reg) +rdev_update_mgmt_frame_registrations(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, + struct mgmt_frame_regs *upd) { might_sleep(); - trace_rdev_mgmt_frame_register(&rdev->wiphy, wdev , frame_type, reg); - rdev->ops->mgmt_frame_register(&rdev->wiphy, wdev , frame_type, reg); + trace_rdev_update_mgmt_frame_registrations(&rdev->wiphy, wdev, upd); + if (rdev->ops->update_mgmt_frame_registrations) + rdev->ops->update_mgmt_frame_registrations(&rdev->wiphy, wdev, + upd); trace_rdev_return_void(&rdev->wiphy); } diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 839df54cee21..ee736620f1e3 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1582,25 +1582,25 @@ TRACE_EVENT(rdev_set_bitrate_mask, WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer)) ); -TRACE_EVENT(rdev_mgmt_frame_register, +TRACE_EVENT(rdev_update_mgmt_frame_registrations, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, - u16 frame_type, bool reg), - TP_ARGS(wiphy, wdev, frame_type, reg), + struct mgmt_frame_regs *upd), + TP_ARGS(wiphy, wdev, upd), TP_STRUCT__entry( WIPHY_ENTRY WDEV_ENTRY - __field(u16, frame_type) - __field(bool, reg) + __field(u16, global_stypes) + __field(u16, interface_stypes) ), TP_fast_assign( WIPHY_ASSIGN; WDEV_ASSIGN; - __entry->frame_type = frame_type; - __entry->reg = reg; + __entry->global_stypes = upd->global_stypes; + __entry->interface_stypes = upd->interface_stypes; ), - TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", frame_type: 0x%.2x, reg: %s ", - WIPHY_PR_ARG, WDEV_PR_ARG, __entry->frame_type, - __entry->reg ? "true" : "false") + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", global: 0x%.2x, intf: 0x%.2x", + WIPHY_PR_ARG, WDEV_PR_ARG, + __entry->global_stypes, __entry->interface_stypes) ); TRACE_EVENT(rdev_return_int_tx_rx, -- cgit v1.2.3 From 9dba48a6ece79da064655736dc7347a5fcadedef Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 17 Apr 2020 12:40:15 +0200 Subject: cfg80211: support multicast RX registration For DPP, there's a need to receive multicast action frames, but many drivers need a special filter configuration for this. 
Support announcing from userspace in the management registration that multicast RX is required, with an extended feature flag if the driver handles this. Signed-off-by: Johannes Berg Reviewed-by: Sergey Matyukevich Link: https://lore.kernel.org/r/20200417124013.c46238801048.Ib041d437ce0bff28a0c6d5dc915f68f1d8591002@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++++ include/uapi/linux/nl80211.h | 13 +++++++++++++ net/wireless/core.h | 3 ++- net/wireless/mlme.c | 38 ++++++++++++++++++++++++++++++-------- net/wireless/nl80211.c | 10 ++++++++++ 5 files changed, 59 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index bc273f6d60f2..dbb9675fe38f 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3390,9 +3390,13 @@ struct cfg80211_update_owe_info { * for the entire device * @interface_stypes: bitmap of management frame subtypes registered * for the given interface + * @global_mcast_rx: mcast RX is needed globally for these subtypes + * @interface_mcast_stypes: mcast RX is needed on this interface + * for these subtypes */ struct mgmt_frame_regs { u32 global_stypes, interface_stypes; + u32 global_mcast_stypes, interface_mcast_stypes; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index afdd9802ccb8..e0dc89eceab8 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -687,6 +687,10 @@ * four bytes for vendor frames including the OUI. The registration * cannot be dropped, but is removed automatically when the netlink * socket is closed. Multiple registrations can be made. + * The %NL80211_ATTR_RECEIVE_MULTICAST flag attribute can be given if + * %NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS is available, in which + * case the registration can also be modified to include/exclude the + * flag, rather than requiring unregistration to change it. * @NL80211_CMD_REGISTER_ACTION: Alias for @NL80211_CMD_REGISTER_FRAME for * backward compatibility * @NL80211_CMD_FRAME: Management frame TX request and RX notification. This @@ -2477,6 +2481,9 @@ enum nl80211_commands { * no roaming occurs between the reauth threshold and PMK expiration, * disassociation is still forced. * + * @NL80211_ATTR_RECEIVE_MULTICAST: multicast flag for the + * %NL80211_CMD_REGISTER_FRAME command, see the description there. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2952,6 +2959,8 @@ enum nl80211_attrs { NL80211_ATTR_PMK_LIFETIME, NL80211_ATTR_PMK_REAUTH_THRESHOLD, + NL80211_ATTR_RECEIVE_MULTICAST, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -5691,6 +5700,9 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_DEL_IBSS_STA: The driver supports removing stations * in IBSS mode, essentially by dropping their state. * + * @NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS: management frame registrations + * are possible for multicast frames and those will be reported properly. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. 
*/ @@ -5742,6 +5754,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_CONTROL_PORT_NO_PREAUTH, NL80211_EXT_FEATURE_PROTECTED_TWT, NL80211_EXT_FEATURE_DEL_IBSS_STA, + NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/core.h b/net/wireless/core.h index 30fb2c35ae43..639d41896573 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -381,7 +381,8 @@ void cfg80211_mlme_down(struct cfg80211_registered_device *rdev, struct net_device *dev); int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid, u16 frame_type, const u8 *match_data, - int match_len, struct netlink_ext_ack *extack); + int match_len, bool multicast_rx, + struct netlink_ext_ack *extack); void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk); void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid); void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev); diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 2e1a21e90b83..409497a3527d 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -426,6 +426,8 @@ struct cfg80211_mgmt_registration { __le16 frame_type; + bool multicast_rx; + u8 match[]; }; @@ -442,10 +444,18 @@ static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev) list_for_each_entry_rcu(tmp, &rdev->wiphy.wdev_list, list) { list_for_each_entry_rcu(reg, &tmp->mgmt_registrations, list) { u32 mask = BIT(le16_to_cpu(reg->frame_type) >> 4); + u32 mcast_mask = 0; + + if (reg->multicast_rx) + mcast_mask = mask; upd.global_stypes |= mask; - if (tmp == wdev) + upd.global_mcast_stypes |= mcast_mask; + + if (tmp == wdev) { upd.interface_stypes |= mask; + upd.interface_mcast_stypes |= mcast_mask; + } } } rcu_read_unlock(); @@ -465,11 +475,13 @@ void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk) int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, u16 frame_type, const u8 *match_data, - int match_len, struct netlink_ext_ack *extack) + int match_len, bool multicast_rx, + struct netlink_ext_ack *extack) { struct cfg80211_mgmt_registration *reg, *nreg; int err = 0; u16 mgmt_type; + bool update_multicast = false; if (!wdev->wiphy->mgmt_stypes) return -EOPNOTSUPP; @@ -520,6 +532,11 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, continue; if (memcmp(reg->match, match_data, mlen) == 0) { + if (reg->multicast_rx != multicast_rx) { + update_multicast = true; + reg->multicast_rx = multicast_rx; + break; + } NL_SET_ERR_MSG(extack, "Match already configured"); err = -EALREADY; break; @@ -529,12 +546,17 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, if (err) goto out; - memcpy(nreg->match, match_data, match_len); - nreg->match_len = match_len; - nreg->nlportid = snd_portid; - nreg->frame_type = cpu_to_le16(frame_type); - nreg->wdev = wdev; - list_add(&nreg->list, &wdev->mgmt_registrations); + if (update_multicast) { + kfree(nreg); + } else { + memcpy(nreg->match, match_data, match_len); + nreg->match_len = match_len; + nreg->nlportid = snd_portid; + nreg->frame_type = cpu_to_le16(frame_type); + nreg->wdev = wdev; + nreg->multicast_rx = multicast_rx; + list_add(&nreg->list, &wdev->mgmt_registrations); + } spin_unlock_bh(&wdev->mgmt_registrations_lock); cfg80211_mgmt_registrations_update(wdev); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 2127e5344b1a..73a3e885d4dd 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ 
-661,6 +661,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_CONTROL_PORT_NO_PREAUTH] = { .type = NLA_FLAG }, [NL80211_ATTR_PMK_LIFETIME] = NLA_POLICY_MIN(NLA_U32, 1), [NL80211_ATTR_PMK_REAUTH_THRESHOLD] = NLA_POLICY_RANGE(NLA_U8, 1, 100), + [NL80211_ATTR_RECEIVE_MULTICAST] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -10773,9 +10774,18 @@ static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->mgmt_tx) return -EOPNOTSUPP; + if (info->attrs[NL80211_ATTR_RECEIVE_MULTICAST] && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS)) { + GENL_SET_ERR_MSG(info, + "multicast RX registrations are not supported"); + return -EOPNOTSUPP; + } + return cfg80211_mlme_register_mgmt(wdev, info->snd_portid, frame_type, nla_data(info->attrs[NL80211_ATTR_FRAME_MATCH]), nla_len(info->attrs[NL80211_ATTR_FRAME_MATCH]), + info->attrs[NL80211_ATTR_RECEIVE_MULTICAST], info->extack); } -- cgit v1.2.3 From 155d7c733807190258639c66b36340948f369349 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 20 Apr 2020 14:06:00 +0200 Subject: nl80211: allow client-only BIGTK support The current NL80211_EXT_FEATURE_BEACON_PROTECTION feature flag requires both AP and client support; add a new one, NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT, that enables support only in client (and P2P-client) modes. Link: https://lore.kernel.org/r/20200420140559.6ba704053a5a.Ifeb869fb0b48e52fe0cb9c15572b93ac8a924f8d@changeid Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 3 +++ net/wireless/nl80211.c | 19 +++++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index e0dc89eceab8..9679d561f7d0 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -5690,6 +5690,8 @@ enum nl80211_feature_flags { * * @NL80211_EXT_FEATURE_BEACON_PROTECTION: The driver supports Beacon protection * and can receive key configuration for BIGTK using key indexes 6 and 7. + * @NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT: The driver supports Beacon + * protection as a client only and cannot transmit protected beacons. * * @NL80211_EXT_FEATURE_CONTROL_PORT_NO_PREAUTH: The driver can disable the * forwarding of preauth frames over the control port.
They are then @@ -5755,6 +5757,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_PROTECTED_TWT, NL80211_EXT_FEATURE_DEL_IBSS_STA, NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS, + NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 73a3e885d4dd..d470d77d2eb6 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3905,14 +3905,25 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) }; void *hdr; struct sk_buff *msg; + bool bigtk_support = false; + + if (wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_PROTECTION)) + bigtk_support = true; + + if ((dev->ieee80211_ptr->iftype == NL80211_IFTYPE_STATION || + dev->ieee80211_ptr->iftype == NL80211_IFTYPE_P2P_CLIENT) && + wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) + bigtk_support = true; if (info->attrs[NL80211_ATTR_KEY_IDX]) { key_idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]); - if (key_idx > 5 && - !wiphy_ext_feature_isset( - &rdev->wiphy, - NL80211_EXT_FEATURE_BEACON_PROTECTION)) + + if (key_idx >= 6 && key_idx <= 7 && !bigtk_support) { + GENL_SET_ERR_MSG(info, "BIGTK not supported"); return -EINVAL; + } } if (info->attrs[NL80211_ATTR_MAC]) -- cgit v1.2.3 From 873b1cf61105a67f01f6fc3758405edb1bd1ba35 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Tue, 21 Apr 2020 17:48:15 +0300 Subject: mac80211: Process multicast RX registration for Action frames Convert a user space registration for processing multicast Action frames (NL80211_CMD_REGISTER_FRAME with NL80211_ATTR_RECEIVE_MULTICAST) to a new enum ieee80211_filter_flags bit FIF_MCAST_ACTION so that drivers can update their RX filter parameters appropriately, if needed. Signed-off-by: Jouni Malinen Link: https://lore.kernel.org/r/20200421144815.19175-1-jouni@codeaurora.org [rename variables to rx_mcast_action_reg indicating action frames only] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 ++++++ net/mac80211/cfg.c | 14 +++++++++++--- net/mac80211/ieee80211_i.h | 1 + net/mac80211/main.c | 3 +++ 4 files changed, 21 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index f6dc5a38720f..f12fe3b0a868 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1620,6 +1620,8 @@ enum ieee80211_vif_flags { * monitor interface (if that is requested.) * @probe_req_reg: probe requests should be reported to mac80211 for this * interface. + * @rx_mcast_action_reg: multicast Action frames should be reported to mac80211 + * for this interface. * @drv_priv: data area for driver use, will always be aligned to * sizeof(void \*). 
* @txq: the multicast data TX queue (if driver uses the TXQ abstraction) @@ -1648,6 +1650,7 @@ struct ieee80211_vif { #endif bool probe_req_reg; + bool rx_mcast_action_reg; bool txqs_stopped[IEEE80211_NUM_ACS]; @@ -3091,6 +3094,8 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); * @FIF_PSPOLL: pass PS Poll frames * * @FIF_PROBE_REQ: pass probe request frames + * + * @FIF_MCAST_ACTION: pass multicast Action frames */ enum ieee80211_filter_flags { FIF_ALLMULTI = 1<<1, @@ -3101,6 +3106,7 @@ enum ieee80211_filter_flags { FIF_OTHER_BSS = 1<<6, FIF_PSPOLL = 1<<7, FIF_PROBE_REQ = 1<<8, + FIF_MCAST_ACTION = 1<<9, }; /** diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index e62b4764e82e..f0d43b9cfa43 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3406,15 +3406,23 @@ ieee80211_update_mgmt_frame_registrations(struct wiphy *wiphy, struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); u32 preq_mask = BIT(IEEE80211_STYPE_PROBE_REQ >> 4); + u32 action_mask = BIT(IEEE80211_STYPE_ACTION >> 4); bool global_change, intf_change; global_change = - local->probe_req_reg != !!(upd->global_stypes & preq_mask); + (local->probe_req_reg != !!(upd->global_stypes & preq_mask)) || + (local->rx_mcast_action_reg != + !!(upd->global_mcast_stypes & action_mask)); local->probe_req_reg = upd->global_stypes & preq_mask; + local->rx_mcast_action_reg = upd->global_mcast_stypes & action_mask; - intf_change = sdata->vif.probe_req_reg != - !!(upd->interface_stypes & preq_mask); + intf_change = (sdata->vif.probe_req_reg != + !!(upd->interface_stypes & preq_mask)) || + (sdata->vif.rx_mcast_action_reg != + !!(upd->interface_mcast_stypes & action_mask)); sdata->vif.probe_req_reg = upd->interface_stypes & preq_mask; + sdata->vif.rx_mcast_action_reg = + upd->interface_mcast_stypes & action_mask; if (!local->open_count) return; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index da41ee996d3d..9407cf44305c 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1168,6 +1168,7 @@ struct ieee80211_local { int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll, fif_probe_req; bool probe_req_reg; + bool rx_mcast_action_reg; unsigned int filter_flags; /* FIF_* */ bool wiphy_ciphers_allocated; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 0e9ad60fb2b3..a0cb052ea30d 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -64,6 +64,9 @@ void ieee80211_configure_filter(struct ieee80211_local *local) if (local->fif_pspoll) new_flags |= FIF_PSPOLL; + if (local->rx_mcast_action_reg) + new_flags |= FIF_MCAST_ACTION; + spin_lock_bh(&local->filter_lock); changed_flags = local->filter_flags ^ new_flags; -- cgit v1.2.3 From 9166cc49767a646990a73380480356416b7794eb Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 26 Mar 2020 15:09:32 +0200 Subject: mac80211: implement Operating Mode Notification extended NSS support Somehow we missed this for a long time, but similar to the extended NSS support in VHT capabilities, we need to have this in Operating Mode notification. Implement it by
 * parsing the 160/80+80 bit there and setting the bandwidth appropriately
 * having callers of ieee80211_get_vht_max_nss() pass in the current max NSS value as received in the operating mode notification in order to modify it appropriately depending on the extended NSS bits.
This updates all drivers that use it, i.e. only iwlwifi/mvm.
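A stand-alone sketch of the extended decode (the mask values are copied from enum ieee80211_vht_opmode_bits in the diff; the helper itself is invented):

#include <stdio.h>

static void decode_opmode(unsigned opmode)
{
	unsigned bw = opmode & 0x03;		   /* ..._CHANWIDTH_MASK */
	unsigned bw160 = !!(opmode & 0x04);	   /* ..._BW_160_80P80 */
	unsigned nss = ((opmode & 0x70) >> 4) + 1; /* ..._RX_NSS_MASK/SHIFT */

	/* with the extended encoding, "80 MHz" plus the new bit means
	 * 160 or 80+80 MHz; width code 3 remains the legacy encoding */
	if (bw == 2 && bw160)
		bw = 3;

	printf("opmode 0x%02x -> width code %u, max rx nss %u\n",
	       opmode, bw, nss);
}

int main(void)
{
	decode_opmode(0x16); /* 80 MHz + ext bit + NSS 2 -> width code 3 */
	decode_opmode(0x01); /* 40 MHz, NSS 1 */
	return 0;
}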
Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.098483728cfa.I4e8c25d3288441759c2793247197229f0696a37d@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/rs.c | 6 +++--- include/linux/ieee80211.h | 12 +++++++++--- net/mac80211/vht.c | 10 ++++++++-- net/wireless/util.c | 26 ++++++++++++++------------ 4 files changed, 34 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c index c1aba2bf73cf..a8c13f6fbce0 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c @@ -1,10 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-only /****************************************************************************** * - * Copyright(c) 2005 - 2014 Intel Corporation. All rights reserved. + * Copyright(c) 2005 - 2014, 2018 - 2020 Intel Corporation. All rights reserved. * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH * Copyright(c) 2016 - 2017 Intel Deutschland GmbH - * Copyright(c) 2018 - 2019 Intel Corporation * * Contact Information: * Intel Linux Wireless @@ -1430,7 +1429,8 @@ static u32 rs_bw_from_sta_bw(struct ieee80211_sta *sta) */ if (ieee80211_get_vht_max_nss(&vht_cap, IEEE80211_VHT_CHANWIDTH_160MHZ, - 0, true) < sta->rx_nss) + 0, true, + sta->rx_nss) < sta->rx_nss) return RATE_MCS_CHAN_WIDTH_80; return RATE_MCS_CHAN_WIDTH_160; case IEEE80211_STA_RX_BW_80: diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 16268ef1cbcc..c326aec535c6 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -9,7 +9,7 @@ * Copyright (c) 2006, Michael Wu * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH * Copyright (c) 2016 - 2017 Intel Deutschland GmbH - * Copyright (c) 2018 - 2019 Intel Corporation + * Copyright (c) 2018 - 2020 Intel Corporation */ #ifndef LINUX_IEEE80211_H @@ -859,6 +859,7 @@ enum ieee80211_ht_chanwidth_values { * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ: 40 MHz channel width * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ: 80 MHz channel width * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ: 160 MHz or 80+80 MHz channel width + * @IEEE80211_OPMODE_NOTIF_BW_160_80P80: 160 / 80+80 MHz indicator flag * @IEEE80211_OPMODE_NOTIF_RX_NSS_MASK: number of spatial streams mask * (the NSS value is the value of this field + 1) * @IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT: number of spatial streams shift @@ -866,11 +867,12 @@ enum ieee80211_ht_chanwidth_values { * using a beamforming steering matrix */ enum ieee80211_vht_opmode_bits { - IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK = 3, + IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK = 0x03, IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ = 0, IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ = 1, IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ = 2, IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ = 3, + IEEE80211_OPMODE_NOTIF_BW_160_80P80 = 0x04, IEEE80211_OPMODE_NOTIF_RX_NSS_MASK = 0x70, IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT = 4, IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF = 0x80, @@ -1731,6 +1733,9 @@ struct ieee80211_mu_edca_param_set { * @ext_nss_bw_capable: indicates whether or not the local transmitter * (rate scaling algorithm) can deal with the new logic * (dot11VHTExtendedNSSBWCapable) + * @max_vht_nss: current maximum NSS as advertised by the STA in + * operating mode notification, can be 0 in which case the + * capability data will be used to derive this (from MCS support) * * Due to the VHT Extended NSS Bandwidth Support, the maximum 
NSS can * vary for a given BW/MCS. This function parses the data. @@ -1739,7 +1744,8 @@ struct ieee80211_mu_edca_param_set { */ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, enum ieee80211_vht_chanwidth bw, - int mcs, bool ext_nss_bw_capable); + int mcs, bool ext_nss_bw_capable, + unsigned int max_vht_nss); /* 802.11ax HE MAC capabilities */ #define IEEE80211_HE_MAC_CAP0_HTC_HE 0x01 diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index 632f07401850..9c6045f9c24d 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -4,7 +4,7 @@ * * Portions of this file * Copyright(c) 2015 - 2016 Intel Deutschland GmbH - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2020 Intel Corporation */ #include @@ -575,15 +575,21 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, switch (opmode & IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK) { case IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ: + /* ignore IEEE80211_OPMODE_NOTIF_BW_160_80P80 must not be set */ sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_20; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ: + /* ignore IEEE80211_OPMODE_NOTIF_BW_160_80P80 must not be set */ sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_40; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ: - sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80; + if (opmode & IEEE80211_OPMODE_NOTIF_BW_160_80P80) + sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; + else + sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ: + /* legacy only, no longer used by newer spec */ sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; break; } diff --git a/net/wireless/util.c b/net/wireless/util.c index 6590efbbcbb9..123d6ce79b8e 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -5,7 +5,7 @@ * Copyright 2007-2009 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2019 Intel Corporation + * Copyright (C) 2018-2020 Intel Corporation */ #include #include @@ -2030,10 +2030,10 @@ EXPORT_SYMBOL(cfg80211_send_layer2_update); int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, enum ieee80211_vht_chanwidth bw, - int mcs, bool ext_nss_bw_capable) + int mcs, bool ext_nss_bw_capable, + unsigned int max_vht_nss) { u16 map = le16_to_cpu(cap->supp_mcs.rx_mcs_map); - int max_vht_nss = 0; int ext_nss_bw; int supp_width; int i, mcs_encoding; @@ -2041,7 +2041,7 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, if (map == 0xffff) return 0; - if (WARN_ON(mcs > 9)) + if (WARN_ON(mcs > 9 || max_vht_nss > 8)) return 0; if (mcs <= 7) mcs_encoding = 0; @@ -2050,16 +2050,18 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, else mcs_encoding = 2; - /* find max_vht_nss for the given MCS */ - for (i = 7; i >= 0; i--) { - int supp = (map >> (2 * i)) & 3; + if (!max_vht_nss) { + /* find max_vht_nss for the given MCS */ + for (i = 7; i >= 0; i--) { + int supp = (map >> (2 * i)) & 3; - if (supp == 3) - continue; + if (supp == 3) + continue; - if (supp >= mcs_encoding) { - max_vht_nss = i + 1; - break; + if (supp >= mcs_encoding) { + max_vht_nss = i + 1; + break; + } } } -- cgit v1.2.3 From d46b4ab870fa29445b701e922e9aa36b15f833ea Mon Sep 17 00:00:00 2001 From: Shaul Triebitz Date: Thu, 26 Mar 2020 15:09:33 +0200 Subject: mac80211: add twt_protected flag to the bss_conf structure Add a flag to the BSS conf whether the BSS and STA support protected TWT. 
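The added condition reduces to a small predicate; a stand-alone sketch (BIT(4) here assumes the kernel's WLAN_RSNX_CAPA_PROTECTED_TWT value):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* the AP's RSNX element must advertise Protected TWT in its first octet
 * and the local driver must have set NL80211_EXT_FEATURE_PROTECTED_TWT */
static bool twt_protected(const uint8_t *rsnx, size_t rsnx_len,
			  bool driver_protected_twt)
{
	return rsnx && rsnx_len && (rsnx[0] & (1u << 4)) &&
	       driver_protected_twt;
}

int main(void)
{
	uint8_t rsnx[] = { 0x10 }; /* Protected TWT bit set */

	/* prints "1 0" */
	printf("%d %d\n",
	       twt_protected(rsnx, sizeof(rsnx), true),
	       twt_protected(rsnx, sizeof(rsnx), false));
	return 0;
}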
Signed-off-by: Shaul Triebitz Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.1dcb2d16fa74.I74d7c007dad2601d2e39f54612fe6554dd5ab386@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ net/mac80211/mlme.c | 9 +++++++++ 2 files changed, 11 insertions(+) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index f12fe3b0a868..5fb80dd8bbbc 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -508,6 +508,7 @@ struct ieee80211_ftm_responder_params { * mode only, set if the AP advertises TWT responder role) * @twt_responder: does this BSS support TWT requester (relevant for managed * mode only, set if the AP advertises TWT responder role) + * @twt_protected: does this BSS support protected TWT frames * @assoc: association status * @ibss_joined: indicates whether this station is part of an IBSS * or not @@ -618,6 +619,7 @@ struct ieee80211_bss_conf { bool he_support; bool twt_requester; bool twt_responder; + bool twt_protected; /* association related data */ bool assoc, ibss_joined; bool ibss_creator; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 7139335f29c0..b77787995723 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3384,10 +3384,19 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, sta); bss_conf->he_support = sta->sta.he_cap.has_he; + if (elems->rsnx && elems->rsnx_len && + (elems->rsnx[0] & WLAN_RSNX_CAPA_PROTECTED_TWT) && + wiphy_ext_feature_isset(local->hw.wiphy, + NL80211_EXT_FEATURE_PROTECTED_TWT)) + bss_conf->twt_protected = true; + else + bss_conf->twt_protected = false; + changed |= ieee80211_recalc_twt_req(sdata, sta, elems); } else { bss_conf->he_support = false; bss_conf->twt_requester = false; + bss_conf->twt_protected = false; } if (bss_conf->he_support) { -- cgit v1.2.3 From a4055e74a2ff7c70ccdb6c36254ad5181464f211 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Thu, 26 Mar 2020 15:09:34 +0200 Subject: mac80211: Don't destroy auth data in case of anti-clogging An SAE AP may reject authentication with WLAN_STATUS_ANTI_CLOG_REQUIRED. As user space will immediately continue the authentication flow, there is no need to destroy the authentication data in this case. This saves an unneeded station removal and channel release.
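A stand-alone sketch of the retained-state decision (3 and 76 are the 802.11 constants behind WLAN_AUTH_SAE and WLAN_STATUS_ANTI_CLOG_REQUIRED):

#include <stdbool.h>
#include <stdio.h>

static bool keep_auth_data(unsigned auth_alg, unsigned status_code)
{
	/* user space will immediately retry with an anti-clogging token,
	 * so tearing down the station and channel state would be wasted */
	return auth_alg == 3 && status_code == 76;
}

int main(void)
{
	printf("SAE, anti-clog required: keep=%d\n", keep_auth_data(3, 76));
	printf("SAE, other failure: keep=%d\n", keep_auth_data(3, 1));
	return 0;
}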
Signed-off-by: Andrei Otcheretianski Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.7483996157a8.I8040a842874aaf6d209df3fc8a2acb97a0bf508b@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index b77787995723..56d61bc9954d 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2948,10 +2948,15 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, } if (status_code != WLAN_STATUS_SUCCESS) { + cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); + + if (auth_alg == WLAN_AUTH_SAE && + status_code == WLAN_STATUS_ANTI_CLOG_REQUIRED) + return; + sdata_info(sdata, "%pM denied authentication (status %d)\n", mgmt->sa, status_code); ieee80211_destroy_auth_data(sdata, false); - cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); event.u.mlme.status = MLME_DENIED; event.u.mlme.reason = status_code; drv_event_callback(sdata->local, sdata, &event); -- cgit v1.2.3 From 2a392596d8811c6d58c014ec881b159c75a0cf45 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Thu, 26 Mar 2020 15:09:35 +0200 Subject: cfg80211: Parse HE membership selector This extends support for drivers that rebuild IEs in the FW (same as with HT/VHT). Signed-off-by: Ilan Peer Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.20feaabfb484.I886252639604c8e3e84b8ef97962f1b0e4beec81@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + include/net/cfg80211.h | 3 ++- net/wireless/nl80211.c | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index c326aec535c6..38f513ce7528 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1067,6 +1067,7 @@ struct ieee80211_mgmt { /* Supported rates membership selectors */ #define BSS_MEMBERSHIP_SELECTOR_HT_PHY 127 #define BSS_MEMBERSHIP_SELECTOR_VHT_PHY 126 +#define BSS_MEMBERSHIP_SELECTOR_HE_PHY 122 /* mgmt header + 1 byte category code */ #define IEEE80211_MIN_ACTION_SIZE offsetof(struct ieee80211_mgmt, u.action.u) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index dbb9675fe38f..e288fdcb3df2 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1054,6 +1054,7 @@ enum cfg80211_ap_settings_flags { * @ht_required: stations must support HT * @vht_required: stations must support VHT * @twt_responder: Enable Target Wait Time + * @he_required: stations must support HE * @flags: flags, as defined in enum cfg80211_ap_settings_flags * @he_obss_pd: OBSS Packet Detection settings * @he_bss_color: BSS Color settings @@ -1083,7 +1084,7 @@ struct cfg80211_ap_settings { const struct ieee80211_vht_cap *vht_cap; const struct ieee80211_he_cap_elem *he_cap; const struct ieee80211_he_operation *he_oper; - bool ht_required, vht_required; + bool ht_required, vht_required, he_required; bool twt_responder; u32 flags; struct ieee80211_he_obss_pd he_obss_pd; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d470d77d2eb6..3d27b24c68b2 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4738,6 +4738,8 @@ static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params, params->ht_required = true; if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_VHT_PHY) params->vht_required = true; + if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_HE_PHY) + params->he_required = true; } } -- cgit v1.2.3 From
4826e721103acf42421304330cf48a642fa163bb Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Thu, 26 Mar 2020 15:09:36 +0200 Subject: mac80211: Skip entries with HE membership selector When parsing supported rates IE. Signed-off-by: Ilan Peer Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.ed3e66f8c197.I93aad0e5ddb7ce79f05f8153922acb9aa5076d38@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 56d61bc9954d..c77f47b41356 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3154,15 +3154,16 @@ static void ieee80211_get_rates(struct ieee80211_supported_band *sband, *have_higher_than_11mbit = true; /* - * Skip HT and VHT BSS membership selectors since they're not - * rates. + * Skip HT, VHT and HE BSS membership selectors since they're + * not rates. * * Note: Even though the membership selector and the basic * rate flag share the same bit, they are not exactly * the same. */ if (supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_HT_PHY) || - supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_VHT_PHY)) + supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_VHT_PHY) || + supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_HE_PHY)) continue; for (j = 0; j < sband->n_bitrates; j++) { -- cgit v1.2.3 From 31d8bb4e07f80935ee9bf599a9d99de7ca90fc5a Mon Sep 17 00:00:00 2001 From: Mordechay Goodstein Date: Thu, 26 Mar 2020 15:09:37 +0200 Subject: mac80211: agg-tx: refactor sending addba We move the actual arming the timer and sending ADDBA to a function for the use in different places calling the same logic. Signed-off-by: Mordechay Goodstein Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.58a337eb90a1.I75934e6464535fbf43969acc796bc886291e79a5@changeid Signed-off-by: Johannes Berg --- net/mac80211/agg-tx.c | 67 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 33da6f738c99..32f40c4f3120 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu * Copyright 2007-2010, Intel Corporation * Copyright(c) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2020 Intel Corporation */ #include @@ -448,6 +448,43 @@ static void sta_addba_resp_timer_expired(struct timer_list *t) ieee80211_stop_tx_ba_session(&sta->sta, tid); } +static void ieee80211_send_addba_with_timeout(struct sta_info *sta, + struct tid_ampdu_tx *tid_tx) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_local *local = sta->local; + u8 tid = tid_tx->tid; + u16 buf_size; + + /* activate the timer for the recipient's addBA response */ + mod_timer(&tid_tx->addba_resp_timer, jiffies + ADDBA_RESP_INTERVAL); + ht_dbg(sdata, "activated addBA response timer on %pM tid %d\n", + sta->sta.addr, tid); + + spin_lock_bh(&sta->lock); + sta->ampdu_mlme.last_addba_req_time[tid] = jiffies; + sta->ampdu_mlme.addba_req_num[tid]++; + spin_unlock_bh(&sta->lock); + + if (sta->sta.he_cap.has_he) { + buf_size = local->hw.max_tx_aggregation_subframes; + } else { + /* + * We really should use what the driver told us it will + * transmit as the maximum, but certain APs (e.g. the + * LinkSys WRT120N with FW v1.0.07 build 002 Jun 18 2012) + * will crash when we use a lower number. 
+ */ + buf_size = IEEE80211_MAX_AMPDU_BUF_HT; + } + + /* send AddBA request */ + ieee80211_send_addba_request(sdata, sta->sta.addr, tid, + tid_tx->dialog_token, + sta->tid_seq[tid] >> 4, + buf_size, tid_tx->timeout); +} + void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) { struct tid_ampdu_tx *tid_tx; @@ -462,7 +499,6 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) .timeout = 0, }; int ret; - u16 buf_size; tid_tx = rcu_dereference_protected_tid_tx(sta, tid); @@ -508,32 +544,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) return; } - /* activate the timer for the recipient's addBA response */ - mod_timer(&tid_tx->addba_resp_timer, jiffies + ADDBA_RESP_INTERVAL); - ht_dbg(sdata, "activated addBA response timer on %pM tid %d\n", - sta->sta.addr, tid); - - spin_lock_bh(&sta->lock); - sta->ampdu_mlme.last_addba_req_time[tid] = jiffies; - sta->ampdu_mlme.addba_req_num[tid]++; - spin_unlock_bh(&sta->lock); - - if (sta->sta.he_cap.has_he) { - buf_size = local->hw.max_tx_aggregation_subframes; - } else { - /* - * We really should use what the driver told us it will - * transmit as the maximum, but certain APs (e.g. the - * LinkSys WRT120N with FW v1.0.07 build 002 Jun 18 2012) - * will crash when we use a lower number. - */ - buf_size = IEEE80211_MAX_AMPDU_BUF_HT; - } - - /* send AddBA request */ - ieee80211_send_addba_request(sdata, sta->sta.addr, tid, - tid_tx->dialog_token, params.ssn, - buf_size, tid_tx->timeout); + ieee80211_send_addba_with_timeout(sta, tid_tx); } /* -- cgit v1.2.3 From 0c197f16f7bc5ddb43073690a80fb15998ad61e4 Mon Sep 17 00:00:00 2001 From: Mordechay Goodstein Date: Thu, 26 Mar 2020 15:09:38 +0200 Subject: mac80211: agg-tx: add an option to defer ADDBA transmit Driver tells mac80211 to sends ADDBA with SSN (starting sequence number) from the head of the queue, while the transmission of all the frames in the queue may take a while, which causes the peer to time out. In order to fix this scenario, add an option to defer ADDBA transmit until queue is drained. Signed-off-by: Mordechay Goodstein Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.0f27423fec75.If67daab123a27c1cbddef000d6a3f212aa6309ef@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 +++++- net/mac80211/agg-tx.c | 12 +++++++++++- net/mac80211/sta_info.h | 2 ++ 3 files changed, 18 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 5fb80dd8bbbc..f3147633dda2 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3125,7 +3125,10 @@ enum ieee80211_filter_flags { * @IEEE80211_AMPDU_RX_START: start RX aggregation * @IEEE80211_AMPDU_RX_STOP: stop RX aggregation * @IEEE80211_AMPDU_TX_START: start TX aggregation, the driver must either - * call ieee80211_start_tx_ba_cb_irqsafe() or return the special + * call ieee80211_start_tx_ba_cb_irqsafe() or + * call ieee80211_start_tx_ba_cb_irqsafe() with status + * %IEEE80211_AMPDU_TX_START_DELAY_ADDBA to delay addba after + * ieee80211_start_tx_ba_cb_irqsafe is called, or just return the special * status %IEEE80211_AMPDU_TX_START_IMMEDIATE. 
* @IEEE80211_AMPDU_TX_OPERATIONAL: TX aggregation has become operational * @IEEE80211_AMPDU_TX_STOP_CONT: stop TX aggregation but continue transmitting @@ -3151,6 +3154,7 @@ enum ieee80211_ampdu_mlme_action { }; #define IEEE80211_AMPDU_TX_START_IMMEDIATE 1 +#define IEEE80211_AMPDU_TX_START_DELAY_ADDBA 2 /** * struct ieee80211_ampdu_params - AMPDU action parameters diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 32f40c4f3120..c2d5f512526d 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -483,6 +483,8 @@ static void ieee80211_send_addba_with_timeout(struct sta_info *sta, tid_tx->dialog_token, sta->tid_seq[tid] >> 4, buf_size, tid_tx->timeout); + + WARN_ON(test_and_set_bit(HT_AGG_STATE_SENT_ADDBA, &tid_tx->state)); } void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) @@ -521,7 +523,9 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) params.ssn = sta->tid_seq[tid] >> 4; ret = drv_ampdu_action(local, sdata, ¶ms); - if (ret == IEEE80211_AMPDU_TX_START_IMMEDIATE) { + if (ret == IEEE80211_AMPDU_TX_START_DELAY_ADDBA) { + return; + } else if (ret == IEEE80211_AMPDU_TX_START_IMMEDIATE) { /* * We didn't send the request yet, so don't need to check * here if we already got a response, just mark as driver @@ -765,6 +769,12 @@ void ieee80211_start_tx_ba_cb(struct sta_info *sta, int tid, if (WARN_ON(test_and_set_bit(HT_AGG_STATE_DRV_READY, &tid_tx->state))) return; + if (!test_bit(HT_AGG_STATE_SENT_ADDBA, &tid_tx->state)) { + ieee80211_send_addba_with_timeout(sta, tid_tx); + /* RESPONSE_RECEIVED state whould trigger the flow again */ + return; + } + if (test_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &tid_tx->state)) ieee80211_agg_tx_operational(local, sta, tid); } diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 36f1abaab9ff..a5de3aa6ea42 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -3,6 +3,7 @@ * Copyright 2002-2005, Devicescape Software, Inc. * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2015-2017 Intel Deutschland GmbH + * Copyright(c) 2020 Intel Corporation */ #ifndef STA_INFO_H @@ -116,6 +117,7 @@ enum ieee80211_sta_info_flags { #define HT_AGG_STATE_WANT_STOP 5 #define HT_AGG_STATE_START_CB 6 #define HT_AGG_STATE_STOP_CB 7 +#define HT_AGG_STATE_SENT_ADDBA 8 DECLARE_EWMA(avg_signal, 10, 8) enum ieee80211_agg_stop_reason { -- cgit v1.2.3 From 302ff8b7a2b01cfb7645f112bb259af1c146c57a Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Thu, 26 Mar 2020 15:09:39 +0200 Subject: mac80211: Fail association when AP has no legacy rates The MLME logic had a workaround that allowed to continue an association with an AP even if the AP did not provide any basic rates in its supported rates in the association response, assuming that the first (non basic) legacy rate could be used as a basic rate. However, this did not consider the case where the AP (which is obviously buggy) did not provide any legacy rate. Fix this by failing the association, as this can result in an unexpected failure in the low level driver and FW, e.g., in rate scale logic etc. 
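A minimal stand-alone model of the check described above (the function and variable names here are invented for illustration; in mlme.c the bitmap and the minimum-rate index are computed while walking the AP's supported rates):

#include <stdio.h>

/* basic_rates: bitmap of basic legacy rates from the assoc response.
 * min_rate_index: index of the lowest legacy rate seen, or -1 if the
 * AP advertised no legacy rates at all. */
static int check_legacy_rates(unsigned int *basic_rates, int min_rate_index)
{
        if (min_rate_index < 0)
                return -1;      /* buggy AP, no legacy rates: fail association */
        if (!*basic_rates)      /* old workaround: promote the lowest rate */
                *basic_rates = 1U << min_rate_index;
        return 0;
}

int main(void)
{
        unsigned int rates = 0;

        printf("no legacy rates: %d\n", check_legacy_rates(&rates, -1));
        printf("workaround: %d (bitmap 0x%x)\n",
               check_legacy_rates(&rates, 2), rates);
        return 0;
}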
Signed-off-by: Ilan Peer Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.d70a1450d83f.I6e6ce5efda351a8544c0e7bfeee260fe3360d401@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index c77f47b41356..59a35c7997c3 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -5036,8 +5036,16 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, * doesn't happen any more, but keep the workaround so * in case some *other* APs are buggy in different ways * we can connect -- with a warning. + * Allow this workaround only in case the AP provided at least + * one rate. */ - if (!basic_rates && min_rate_index >= 0) { + if (min_rate_index < 0) { + sdata_info(sdata, + "No legacy rates in association response\n"); + + sta_info_free(local, new_sta); + return -EINVAL; + } else if (!basic_rates) { sdata_info(sdata, "No basic rates, using min rate instead\n"); basic_rates = BIT(min_rate_index); -- cgit v1.2.3 From dba25b04c61170cf8592f87df2bb086201047473 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 26 Mar 2020 15:09:40 +0200 Subject: mac80211: minstrel_ht_assign_best_tp_rates: remove redundant test We know this pointer isn't NULL and in fact dereferenced it before, remove the redundant test. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.adf551928846.Iae9015573d6c350cc1b12a311d6d13d086beec6c@changeid Signed-off-by: Johannes Berg --- net/mac80211/rc80211_minstrel_ht.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 694a31978a04..5547111d22bf 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2010-2013 Felix Fietkau + * Copyright (C) 2019-2020 Intel Corporation */ #include #include @@ -490,7 +491,7 @@ minstrel_ht_assign_best_tp_rates(struct minstrel_ht_sta *mi, tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_mcs_tp = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); - if (tmp_cck_tp_rate && tmp_cck_tp > tmp_mcs_tp) { + if (tmp_cck_tp > tmp_mcs_tp) { for(i = 0; i < MAX_THR_RATES; i++) { minstrel_ht_sort_best_tp_rates(mi, tmp_cck_tp_rate[i], tmp_mcs_tp_rate); -- cgit v1.2.3 From 934f4c7dd3a544bb8000f7436f1f0e12e04ebc37 Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Wed, 1 Apr 2020 18:18:03 -0700 Subject: cfg80211: express channels with a KHz component Some bands (S1G) define channels centered on a non-integer MHz. Give ieee80211_channel and cfg80211_chan_def a freq_offset component where the final frequency can be expressed as: MHZ_TO_KHZ(chan->center_freq) + chan->freq_offset; Also provide some helper functions to do the frequency conversion and test for equality. Retain the existing interface to frequency and channel conversion helpers, and expose new ones which handle frequencies in units of KHz. Some internal functions (net/wireless/chan.c) pass around a frequency value. Convert these to units of KHz. mesh, ibss, wext, etc. are currently ignored. 
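The arithmetic behind the new helpers is simple enough to show as a stand-alone program. The macros mirror the kernel's MHZ_TO_KHZ/KHZ_TO_MHZ; the trimmed-down struct and the 902.5 MHz S1G-style channel are purely illustrative:

#include <stdio.h>

#define MHZ_TO_KHZ(freq) ((freq) * 1000)
#define KHZ_TO_MHZ(freq) ((freq) / 1000)

struct channel {
        unsigned int center_freq;       /* MHz */
        unsigned int freq_offset;       /* KHz, always < 1000 */
};

/* Final frequency in KHz, as ieee80211_channel_to_khz() computes it */
static unsigned int channel_to_khz(const struct channel *c)
{
        return MHZ_TO_KHZ(c->center_freq) + c->freq_offset;
}

int main(void)
{
        struct channel s1g = { .center_freq = 902, .freq_offset = 500 };
        unsigned int khz = channel_to_khz(&s1g);

        printf("%u.%03u MHz = %u KHz\n",
               s1g.center_freq, s1g.freq_offset, khz);    /* 902.500 MHz = 902500 KHz */
        printf("whole-MHz part: %u\n", KHZ_TO_MHZ(khz));  /* 902 */
        return 0;
}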
Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200402011810.22947-3-thomas@adapt-ip.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 92 ++++++++++++++++++++++++++++++++++++++++++++++---- net/wireless/chan.c | 68 +++++++++++++++++++++---------------- net/wireless/reg.c | 40 +++++++++++----------- net/wireless/scan.c | 4 +-- net/wireless/trace.h | 21 +++++++++--- net/wireless/util.c | 32 +++++++++++------- 6 files changed, 182 insertions(+), 75 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index e288fdcb3df2..a82fc59a1d82 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -128,6 +128,7 @@ enum ieee80211_channel_flags { * with cfg80211. * * @center_freq: center frequency in MHz + * @freq_offset: offset from @center_freq, in KHz * @hw_value: hardware-specific value for the channel * @flags: channel flags from &enum ieee80211_channel_flags. * @orig_flags: channel flags at registration time, used by regulatory @@ -149,6 +150,7 @@ enum ieee80211_channel_flags { struct ieee80211_channel { enum nl80211_band band; u32 center_freq; + u16 freq_offset; u16 hw_value; u32 flags; int max_antenna_gain; @@ -617,6 +619,7 @@ struct key_params { * If edmg is requested (i.e. the .channels member is non-zero), * chan will define the primary channel and all other * parameters are ignored. + * @freq1_offset: offset from @center_freq1, in KHz */ struct cfg80211_chan_def { struct ieee80211_channel *chan; @@ -624,6 +627,7 @@ struct cfg80211_chan_def { u32 center_freq1; u32 center_freq2; struct ieee80211_edmg edmg; + u16 freq1_offset; }; /** @@ -713,6 +717,7 @@ cfg80211_chandef_identical(const struct cfg80211_chan_def *chandef1, return (chandef1->chan == chandef2->chan && chandef1->width == chandef2->width && chandef1->center_freq1 == chandef2->center_freq1 && + chandef1->freq1_offset == chandef2->freq1_offset && chandef1->center_freq2 == chandef2->center_freq2); } @@ -5177,30 +5182,92 @@ static inline void *wdev_priv(struct wireless_dev *wdev) * cfg80211 offers a number of utility functions that can be useful. */ +/** + * ieee80211_channel_equal - compare two struct ieee80211_channel + * + * @a: 1st struct ieee80211_channel + * @b: 2nd struct ieee80211_channel + * Return: true if center frequency of @a == @b + */ +static inline bool +ieee80211_channel_equal(struct ieee80211_channel *a, + struct ieee80211_channel *b) +{ + return (a->center_freq == b->center_freq && + a->freq_offset == b->freq_offset); +} + +/** + * ieee80211_channel_to_khz - convert ieee80211_channel to frequency in KHz + * @chan: struct ieee80211_channel to convert + * Return: The corresponding frequency (in KHz) + */ +static inline u32 +ieee80211_channel_to_khz(const struct ieee80211_channel *chan) +{ + return MHZ_TO_KHZ(chan->center_freq) + chan->freq_offset; +} + +/** + * ieee80211_channel_to_freq_khz - convert channel number to frequency + * @chan: channel number + * @band: band, necessary due to channel number overlap + * Return: The corresponding frequency (in KHz), or 0 if the conversion failed. + */ +u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band); + /** * ieee80211_channel_to_frequency - convert channel number to frequency * @chan: channel number * @band: band, necessary due to channel number overlap * Return: The corresponding frequency (in MHz), or 0 if the conversion failed. 
*/ -int ieee80211_channel_to_frequency(int chan, enum nl80211_band band); +static inline int +ieee80211_channel_to_frequency(int chan, enum nl80211_band band) +{ + return KHZ_TO_MHZ(ieee80211_channel_to_freq_khz(chan, band)); +} + +/** + * ieee80211_freq_khz_to_channel - convert frequency to channel number + * @freq: center frequency in KHz + * Return: The corresponding channel, or 0 if the conversion failed. + */ +int ieee80211_freq_khz_to_channel(u32 freq); /** * ieee80211_frequency_to_channel - convert frequency to channel number - * @freq: center frequency + * @freq: center frequency in MHz * Return: The corresponding channel, or 0 if the conversion failed. */ -int ieee80211_frequency_to_channel(int freq); +static inline int +ieee80211_frequency_to_channel(int freq) +{ + return ieee80211_freq_khz_to_channel(MHZ_TO_KHZ(freq)); +} + +/** + * ieee80211_get_channel_khz - get channel struct from wiphy for specified + * frequency + * @wiphy: the struct wiphy to get the channel for + * @freq: the center frequency (in KHz) of the channel + * Return: The channel struct from @wiphy at @freq. + */ +struct ieee80211_channel * +ieee80211_get_channel_khz(struct wiphy *wiphy, u32 freq); /** * ieee80211_get_channel - get channel struct from wiphy for specified frequency * * @wiphy: the struct wiphy to get the channel for - * @freq: the center frequency of the channel - * + * @freq: the center frequency (in MHz) of the channel * Return: The channel struct from @wiphy at @freq. */ -struct ieee80211_channel *ieee80211_get_channel(struct wiphy *wiphy, int freq); +static inline struct ieee80211_channel * +ieee80211_get_channel(struct wiphy *wiphy, int freq) +{ + return ieee80211_get_channel_khz(wiphy, MHZ_TO_KHZ(freq)); +} /** * ieee80211_get_response_rate - get basic rate for a given rate @@ -7228,6 +7295,19 @@ bool ieee80211_operating_class_to_band(u8 operating_class, bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef, u8 *op_class); +/** + * ieee80211_chandef_to_khz - convert chandef to frequency in KHz + * + * @chandef: the chandef to convert + * + * Returns the center frequency of chandef (1st segment) in KHz. 
+ */ +static inline u32 +ieee80211_chandef_to_khz(const struct cfg80211_chan_def *chandef) +{ + return MHZ_TO_KHZ(chandef->center_freq1) + chandef->freq1_offset; +} + /* * cfg80211_tdls_oper_request - request userspace to perform TDLS operation * @dev: the device on which the operation is requested diff --git a/net/wireless/chan.c b/net/wireless/chan.c index fcac5c6366e1..d60e50a3b910 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -27,6 +27,7 @@ void cfg80211_chandef_create(struct cfg80211_chan_def *chandef, return; chandef->chan = chan; + chandef->freq1_offset = chan->freq_offset; chandef->center_freq2 = 0; chandef->edmg.bw_config = 0; chandef->edmg.channels = 0; @@ -153,7 +154,8 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) case NL80211_CHAN_WIDTH_10: case NL80211_CHAN_WIDTH_20: case NL80211_CHAN_WIDTH_20_NOHT: - if (chandef->center_freq1 != control_freq) + if (ieee80211_chandef_to_khz(chandef) != + ieee80211_channel_to_khz(chandef->chan)) return false; if (chandef->center_freq2) return false; @@ -386,10 +388,11 @@ static u32 cfg80211_get_start_freq(u32 center_freq, { u32 start_freq; - if (bandwidth <= 20) + bandwidth = MHZ_TO_KHZ(bandwidth); + if (bandwidth <= MHZ_TO_KHZ(20)) start_freq = center_freq; else - start_freq = center_freq - bandwidth/2 + 10; + start_freq = center_freq - bandwidth / 2 + MHZ_TO_KHZ(10); return start_freq; } @@ -399,10 +402,11 @@ static u32 cfg80211_get_end_freq(u32 center_freq, { u32 end_freq; - if (bandwidth <= 20) + bandwidth = MHZ_TO_KHZ(bandwidth); + if (bandwidth <= MHZ_TO_KHZ(20)) end_freq = center_freq; else - end_freq = center_freq + bandwidth/2 - 10; + end_freq = center_freq + bandwidth / 2 - MHZ_TO_KHZ(10); return end_freq; } @@ -417,8 +421,8 @@ static int cfg80211_get_chans_dfs_required(struct wiphy *wiphy, start_freq = cfg80211_get_start_freq(center_freq, bandwidth); end_freq = cfg80211_get_end_freq(center_freq, bandwidth); - for (freq = start_freq; freq <= end_freq; freq += 20) { - c = ieee80211_get_channel(wiphy, freq); + for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) { + c = ieee80211_get_channel_khz(wiphy, freq); if (!c) return -EINVAL; @@ -449,8 +453,8 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy, return -EINVAL; ret = cfg80211_get_chans_dfs_required(wiphy, - chandef->center_freq1, - width); + ieee80211_chandef_to_khz(chandef), + width); if (ret < 0) return ret; else if (ret > 0) @@ -460,8 +464,8 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy, return 0; ret = cfg80211_get_chans_dfs_required(wiphy, - chandef->center_freq2, - width); + MHZ_TO_KHZ(chandef->center_freq2), + width); if (ret < 0) return ret; else if (ret > 0) @@ -503,8 +507,8 @@ static int cfg80211_get_chans_dfs_usable(struct wiphy *wiphy, * DFS_AVAILABLE). Return number of usable channels * (require CAC). Allow DFS and non-DFS channel mix. 
*/ - for (freq = start_freq; freq <= end_freq; freq += 20) { - c = ieee80211_get_channel(wiphy, freq); + for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) { + c = ieee80211_get_channel_khz(wiphy, freq); if (!c) return -EINVAL; @@ -536,8 +540,9 @@ bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy, if (width < 0) return false; - r1 = cfg80211_get_chans_dfs_usable(wiphy, chandef->center_freq1, - width); + r1 = cfg80211_get_chans_dfs_usable(wiphy, + MHZ_TO_KHZ(chandef->center_freq1), + width); if (r1 < 0) return false; @@ -546,8 +551,8 @@ bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy, case NL80211_CHAN_WIDTH_80P80: WARN_ON(!chandef->center_freq2); r2 = cfg80211_get_chans_dfs_usable(wiphy, - chandef->center_freq2, - width); + MHZ_TO_KHZ(chandef->center_freq2), + width); if (r2 < 0) return false; break; @@ -694,8 +699,8 @@ static bool cfg80211_get_chans_dfs_available(struct wiphy *wiphy, * If any channel in between is disabled or has not * had gone through CAC return false */ - for (freq = start_freq; freq <= end_freq; freq += 20) { - c = ieee80211_get_channel(wiphy, freq); + for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) { + c = ieee80211_get_channel_khz(wiphy, freq); if (!c) return false; @@ -724,7 +729,8 @@ static bool cfg80211_chandef_dfs_available(struct wiphy *wiphy, if (width < 0) return false; - r = cfg80211_get_chans_dfs_available(wiphy, chandef->center_freq1, + r = cfg80211_get_chans_dfs_available(wiphy, + MHZ_TO_KHZ(chandef->center_freq1), width); /* If any of channels unavailable for cf1 just return */ @@ -735,8 +741,8 @@ static bool cfg80211_chandef_dfs_available(struct wiphy *wiphy, case NL80211_CHAN_WIDTH_80P80: WARN_ON(!chandef->center_freq2); r = cfg80211_get_chans_dfs_available(wiphy, - chandef->center_freq2, - width); + MHZ_TO_KHZ(chandef->center_freq2), + width); break; default: WARN_ON(chandef->center_freq2); @@ -757,8 +763,8 @@ static unsigned int cfg80211_get_chans_dfs_cac_time(struct wiphy *wiphy, start_freq = cfg80211_get_start_freq(center_freq, bandwidth); end_freq = cfg80211_get_end_freq(center_freq, bandwidth); - for (freq = start_freq; freq <= end_freq; freq += 20) { - c = ieee80211_get_channel(wiphy, freq); + for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) { + c = ieee80211_get_channel_khz(wiphy, freq); if (!c) return 0; @@ -790,14 +796,14 @@ cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy, return 0; t1 = cfg80211_get_chans_dfs_cac_time(wiphy, - chandef->center_freq1, + MHZ_TO_KHZ(chandef->center_freq1), width); if (!chandef->center_freq2) return t1; t2 = cfg80211_get_chans_dfs_cac_time(wiphy, - chandef->center_freq2, + MHZ_TO_KHZ(chandef->center_freq2), width); return max(t1, t2); @@ -813,8 +819,8 @@ static bool cfg80211_secondary_chans_ok(struct wiphy *wiphy, start_freq = cfg80211_get_start_freq(center_freq, bandwidth); end_freq = cfg80211_get_end_freq(center_freq, bandwidth); - for (freq = start_freq; freq <= end_freq; freq += 20) { - c = ieee80211_get_channel(wiphy, freq); + for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) { + c = ieee80211_get_channel_khz(wiphy, freq); if (!c || c->flags & prohibited_flags) return false; } @@ -976,13 +982,15 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, prohibited_flags |= IEEE80211_CHAN_NO_OFDM; - if (!cfg80211_secondary_chans_ok(wiphy, chandef->center_freq1, + if (!cfg80211_secondary_chans_ok(wiphy, + ieee80211_chandef_to_khz(chandef), width, prohibited_flags)) return false; if (!chandef->center_freq2) return true; - return 
cfg80211_secondary_chans_ok(wiphy, chandef->center_freq2, + return cfg80211_secondary_chans_ok(wiphy, + MHZ_TO_KHZ(chandef->center_freq2), width, prohibited_flags); } EXPORT_SYMBOL(cfg80211_chandef_usable); diff --git a/net/wireless/reg.c b/net/wireless/reg.c index d476d4da0d09..0d74a31ef0ab 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1658,22 +1658,23 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd const struct ieee80211_channel *chan) { const struct ieee80211_freq_range *freq_range = NULL; - u32 max_bandwidth_khz, bw_flags = 0; + u32 max_bandwidth_khz, center_freq_khz, bw_flags = 0; freq_range = ®_rule->freq_range; max_bandwidth_khz = freq_range->max_bandwidth_khz; + center_freq_khz = ieee80211_channel_to_khz(chan); /* Check if auto calculation requested */ if (reg_rule->flags & NL80211_RRF_AUTO_BW) max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); /* If we get a reg_rule we can assume that at least 5Mhz fit */ if (!cfg80211_does_bw_fit_range(freq_range, - MHZ_TO_KHZ(chan->center_freq), + center_freq_khz, MHZ_TO_KHZ(10))) bw_flags |= IEEE80211_CHAN_NO_10MHZ; if (!cfg80211_does_bw_fit_range(freq_range, - MHZ_TO_KHZ(chan->center_freq), + center_freq_khz, MHZ_TO_KHZ(20))) bw_flags |= IEEE80211_CHAN_NO_20MHZ; @@ -1710,7 +1711,7 @@ static void handle_channel(struct wiphy *wiphy, flags = chan->orig_flags; - reg_rule = freq_reg_info(wiphy, MHZ_TO_KHZ(chan->center_freq)); + reg_rule = freq_reg_info(wiphy, ieee80211_channel_to_khz(chan)); if (IS_ERR(reg_rule)) { /* * We will disable all channels that do not match our @@ -1729,13 +1730,13 @@ static void handle_channel(struct wiphy *wiphy, if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER && request_wiphy && request_wiphy == wiphy && request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) { - pr_debug("Disabling freq %d MHz for good\n", - chan->center_freq); + pr_debug("Disabling freq %d.%03d MHz for good\n", + chan->center_freq, chan->freq_offset); chan->orig_flags |= IEEE80211_CHAN_DISABLED; chan->flags = chan->orig_flags; } else { - pr_debug("Disabling freq %d MHz\n", - chan->center_freq); + pr_debug("Disabling freq %d.%03d MHz\n", + chan->center_freq, chan->freq_offset); chan->flags |= IEEE80211_CHAN_DISABLED; } return; @@ -1936,7 +1937,7 @@ static void handle_reg_beacon(struct wiphy *wiphy, unsigned int chan_idx, sband = wiphy->bands[reg_beacon->chan.band]; chan = &sband->channels[chan_idx]; - if (likely(chan->center_freq != reg_beacon->chan.center_freq)) + if (likely(!ieee80211_channel_equal(chan, ®_beacon->chan))) return; if (chan->beacon_found) @@ -2269,18 +2270,18 @@ static void handle_channel_custom(struct wiphy *wiphy, u32 bw_flags = 0; const struct ieee80211_reg_rule *reg_rule = NULL; const struct ieee80211_power_rule *power_rule = NULL; - u32 bw; + u32 bw, center_freq_khz; + center_freq_khz = ieee80211_channel_to_khz(chan); for (bw = MHZ_TO_KHZ(20); bw >= min_bw; bw = bw / 2) { - reg_rule = freq_reg_info_regd(MHZ_TO_KHZ(chan->center_freq), - regd, bw); + reg_rule = freq_reg_info_regd(center_freq_khz, regd, bw); if (!IS_ERR(reg_rule)) break; } if (IS_ERR_OR_NULL(reg_rule)) { - pr_debug("Disabling freq %d MHz as custom regd has no rule that fits it\n", - chan->center_freq); + pr_debug("Disabling freq %d.%03d MHz as custom regd has no rule that fits it\n", + chan->center_freq, chan->freq_offset); if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) { chan->flags |= IEEE80211_CHAN_DISABLED; } else { @@ -3337,8 +3338,8 @@ static bool pending_reg_beacon(struct 
ieee80211_channel *beacon_chan) struct reg_beacon *pending_beacon; list_for_each_entry(pending_beacon, ®_pending_beacons, list) - if (beacon_chan->center_freq == - pending_beacon->chan.center_freq) + if (ieee80211_channel_equal(beacon_chan, + &pending_beacon->chan)) return true; return false; } @@ -3367,9 +3368,10 @@ int regulatory_hint_found_beacon(struct wiphy *wiphy, if (!reg_beacon) return -ENOMEM; - pr_debug("Found new beacon on frequency: %d MHz (Ch %d) on %s\n", - beacon_chan->center_freq, - ieee80211_frequency_to_channel(beacon_chan->center_freq), + pr_debug("Found new beacon on frequency: %d.%03d MHz (Ch %d) on %s\n", + beacon_chan->center_freq, beacon_chan->freq_offset, + ieee80211_freq_khz_to_channel( + ieee80211_channel_to_khz(beacon_chan)), wiphy_name(wiphy)); memcpy(®_beacon->chan, beacon_chan, diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 4000382aef48..74ea4cfb39fb 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1322,8 +1322,8 @@ cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen, return channel; } - freq = ieee80211_channel_to_frequency(channel_number, channel->band); - alt_channel = ieee80211_get_channel(wiphy, freq); + freq = ieee80211_channel_to_freq_khz(channel_number, channel->band); + alt_channel = ieee80211_get_channel_khz(wiphy, freq); if (!alt_channel) { if (channel->band == NL80211_BAND_2GHZ) { /* diff --git a/net/wireless/trace.h b/net/wireless/trace.h index ee736620f1e3..53c887ea67c7 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -112,24 +112,29 @@ } while (0) #define CHAN_ENTRY __field(enum nl80211_band, band) \ - __field(u32, center_freq) + __field(u32, center_freq) \ + __field(u16, freq_offset) #define CHAN_ASSIGN(chan) \ do { \ if (chan) { \ __entry->band = chan->band; \ __entry->center_freq = chan->center_freq; \ + __entry->freq_offset = chan->freq_offset; \ } else { \ __entry->band = 0; \ __entry->center_freq = 0; \ + __entry->freq_offset = 0; \ } \ } while (0) -#define CHAN_PR_FMT "band: %d, freq: %u" -#define CHAN_PR_ARG __entry->band, __entry->center_freq +#define CHAN_PR_FMT "band: %d, freq: %u.%03u" +#define CHAN_PR_ARG __entry->band, __entry->center_freq, __entry->freq_offset #define CHAN_DEF_ENTRY __field(enum nl80211_band, band) \ __field(u32, control_freq) \ + __field(u32, freq_offset) \ __field(u32, width) \ __field(u32, center_freq1) \ + __field(u32, freq1_offset) \ __field(u32, center_freq2) #define CHAN_DEF_ASSIGN(chandef) \ do { \ @@ -137,21 +142,27 @@ __entry->band = (chandef)->chan->band; \ __entry->control_freq = \ (chandef)->chan->center_freq; \ + __entry->freq_offset = \ + (chandef)->chan->freq_offset; \ __entry->width = (chandef)->width; \ __entry->center_freq1 = (chandef)->center_freq1;\ + __entry->freq1_offset = (chandef)->freq1_offset;\ __entry->center_freq2 = (chandef)->center_freq2;\ } else { \ __entry->band = 0; \ __entry->control_freq = 0; \ + __entry->freq_offset = 0; \ __entry->width = 0; \ __entry->center_freq1 = 0; \ + __entry->freq1_offset = 0; \ __entry->center_freq2 = 0; \ } \ } while (0) #define CHAN_DEF_PR_FMT \ - "band: %d, control freq: %u, width: %d, cf1: %u, cf2: %u" + "band: %d, control freq: %u.%03u, width: %d, cf1: %u.%03u, cf2: %u" #define CHAN_DEF_PR_ARG __entry->band, __entry->control_freq, \ - __entry->width, __entry->center_freq1, \ + __entry->freq_offset, __entry->width, \ + __entry->center_freq1, __entry->freq1_offset, \ __entry->center_freq2 #define SINFO_ENTRY __field(int, generation) \ diff --git a/net/wireless/util.c 
b/net/wireless/util.c index 123d6ce79b8e..df75e58eca5d 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -72,7 +72,7 @@ u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband, } EXPORT_SYMBOL(ieee80211_mandatory_rates); -int ieee80211_channel_to_frequency(int chan, enum nl80211_band band) +u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) { /* see 802.11 17.3.8.3.2 and Annex J * there are overlapping channel numbers in 5GHz and 2GHz bands */ @@ -81,15 +81,15 @@ int ieee80211_channel_to_frequency(int chan, enum nl80211_band band) switch (band) { case NL80211_BAND_2GHZ: if (chan == 14) - return 2484; + return MHZ_TO_KHZ(2484); else if (chan < 14) - return 2407 + chan * 5; + return MHZ_TO_KHZ(2407 + chan * 5); break; case NL80211_BAND_5GHZ: if (chan >= 182 && chan <= 196) - return 4000 + chan * 5; + return MHZ_TO_KHZ(4000 + chan * 5); else - return 5000 + chan * 5; + return MHZ_TO_KHZ(5000 + chan * 5); break; case NL80211_BAND_6GHZ: /* see 802.11ax D4.1 27.3.22.2 */ @@ -98,17 +98,20 @@ int ieee80211_channel_to_frequency(int chan, enum nl80211_band band) break; case NL80211_BAND_60GHZ: if (chan < 7) - return 56160 + chan * 2160; + return MHZ_TO_KHZ(56160 + chan * 2160); break; default: ; } return 0; /* not supported */ } -EXPORT_SYMBOL(ieee80211_channel_to_frequency); +EXPORT_SYMBOL(ieee80211_channel_to_freq_khz); -int ieee80211_frequency_to_channel(int freq) +int ieee80211_freq_khz_to_channel(u32 freq) { + /* TODO: just handle MHz for now */ + freq = KHZ_TO_MHZ(freq); + /* see 802.11 17.3.8.3.2 and Annex J */ if (freq == 2484) return 14; @@ -126,9 +129,10 @@ int ieee80211_frequency_to_channel(int freq) else return 0; } -EXPORT_SYMBOL(ieee80211_frequency_to_channel); +EXPORT_SYMBOL(ieee80211_freq_khz_to_channel); -struct ieee80211_channel *ieee80211_get_channel(struct wiphy *wiphy, int freq) +struct ieee80211_channel *ieee80211_get_channel_khz(struct wiphy *wiphy, + u32 freq) { enum nl80211_band band; struct ieee80211_supported_band *sband; @@ -141,14 +145,16 @@ struct ieee80211_channel *ieee80211_get_channel(struct wiphy *wiphy, int freq) continue; for (i = 0; i < sband->n_channels; i++) { - if (sband->channels[i].center_freq == freq) - return &sband->channels[i]; + struct ieee80211_channel *chan = &sband->channels[i]; + + if (ieee80211_channel_to_khz(chan) == freq) + return chan; } } return NULL; } -EXPORT_SYMBOL(ieee80211_get_channel); +EXPORT_SYMBOL(ieee80211_get_channel_khz); static void set_mandatory_flags_band(struct ieee80211_supported_band *sband) { -- cgit v1.2.3 From b6011960f392d1de619f10aa5d088c27f1e7526c Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Wed, 1 Apr 2020 18:18:04 -0700 Subject: mac80211: handle channel frequency offset cfg80211_chan_def and ieee80211_channel recently gained a frequency offset component. Handle this where it makes sense (potentially required by S1G channels). For IBSS, TDLS, CSA, and ROC we return -EOPNOTSUPP if a channel with frequency offset is passed, since they may or may not work. Once someone tests and verifies these commands work on those types of channels, we can remove that error. join_ocb and join_mesh look harmless because they use a simple ieee80211_vif_use_channel(), which is using an already verified channel, so we let those through.
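The guard applied below to IBSS, TDLS, CSA and ROC reduces to a few lines. This sketch uses simplified stand-in types rather than the real mac80211 structures; only the -EOPNOTSUPP policy is taken from the patch:

#include <errno.h>
#include <stdio.h>

struct channel {
        unsigned int center_freq;       /* MHz */
        unsigned int freq_offset;       /* KHz */
};

/* Refuse code paths that have not been verified with KHz-offset channels */
static int start_remain_on_channel(const struct channel *chan)
{
        if (chan->freq_offset)
                return -EOPNOTSUPP;     /* this may work, but is untested */
        /* ... normal remain-on-channel setup would continue here ... */
        return 0;
}

int main(void)
{
        struct channel plain = { 2412, 0 };
        struct channel s1g = { 902, 500 };

        printf("plain: %d, with offset: %d\n",
               start_remain_on_channel(&plain),
               start_remain_on_channel(&s1g));
        return 0;
}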
Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200402011810.22947-4-thomas@adapt-ip.com Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 6 ++++++ net/mac80211/chan.c | 1 + net/mac80211/ibss.c | 5 +++++ net/mac80211/main.c | 8 +++++--- net/mac80211/mlme.c | 16 ++++++++++++---- net/mac80211/offchannel.c | 4 ++++ net/mac80211/scan.c | 1 + net/mac80211/tdls.c | 4 ++++ net/mac80211/trace.h | 41 +++++++++++++++++++++++++++++++++-------- 9 files changed, 71 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index f0d43b9cfa43..ae3e06375a28 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3287,6 +3287,12 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, goto out; } + if (params->chandef.chan->freq_offset) { + /* this may work, but is untested */ + err = -EOPNOTSUPP; + goto out; + } + chanctx = container_of(conf, struct ieee80211_chanctx, conf); ch_switch.timestamp = 0; diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 9c94baaf693c..e6e192f53e4e 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -533,6 +533,7 @@ static void ieee80211_del_chanctx(struct ieee80211_local *local, struct cfg80211_chan_def *chandef = &local->_oper_chandef; chandef->width = NL80211_CHAN_WIDTH_20_NOHT; chandef->center_freq1 = chandef->chan->center_freq; + chandef->freq1_offset = chandef->chan->freq_offset; chandef->center_freq2 = 0; /* NOTE: Disabling radar is only valid here for diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index d40744903fa9..2479cd48fed0 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -1758,6 +1758,11 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, int i; int ret; + if (params->chandef.chan->freq_offset) { + /* this may work, but is untested */ + return -EOPNOTSUPP; + } + ret = cfg80211_chandef_dfs_required(local->hw.wiphy, ¶ms->chandef, sdata->wdev.iftype); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index a0cb052ea30d..dfcee5e462da 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -107,13 +107,15 @@ static u32 ieee80211_hw_conf_chan(struct ieee80211_local *local) chandef.chan = local->tmp_channel; chandef.width = NL80211_CHAN_WIDTH_20_NOHT; chandef.center_freq1 = chandef.chan->center_freq; + chandef.freq1_offset = chandef.chan->freq_offset; } else chandef = local->_oper_chandef; WARN(!cfg80211_chandef_valid(&chandef), - "control:%d MHz width:%d center: %d/%d MHz", - chandef.chan->center_freq, chandef.width, - chandef.center_freq1, chandef.center_freq2); + "control:%d.%03d MHz width:%d center: %d.%03d/%d MHz", + chandef.chan->center_freq, chandef.chan->freq_offset, + chandef.width, chandef.center_freq1, chandef.freq1_offset, + chandef.center_freq2); if (!cfg80211_chandef_identical(&chandef, &local->_oper_chandef)) local->hw.conf.flags |= IEEE80211_CONF_OFFCHANNEL; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 59a35c7997c3..acc8adf50d69 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -162,6 +162,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, chandef->chan = channel; chandef->width = NL80211_CHAN_WIDTH_20_NOHT; chandef->center_freq1 = channel->center_freq; + chandef->freq1_offset = channel->freq_offset; if (!ht_oper || !sta_ht_cap.ht_supported) { ret = IEEE80211_STA_DISABLE_HT | @@ -396,9 +397,12 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, return 0; sdata_info(sdata, - "AP %pM changed bandwidth, new config is %d MHz, width %d (%d/%d 
MHz)\n", - ifmgd->bssid, chandef.chan->center_freq, chandef.width, - chandef.center_freq1, chandef.center_freq2); + "AP %pM changed bandwidth, new config is %d.%03d MHz, " + "width %d (%d.%03d/%d MHz)\n", + ifmgd->bssid, chandef.chan->center_freq, + chandef.chan->freq_offset, chandef.width, + chandef.center_freq1, chandef.freq1_offset, + chandef.center_freq2); if (flags != (ifmgd->flags & (IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT | @@ -1364,10 +1368,14 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, if (!cfg80211_chandef_usable(local->hw.wiphy, &csa_ie.chandef, IEEE80211_CHAN_DISABLED)) { sdata_info(sdata, - "AP %pM switches to unsupported channel (%d MHz, width:%d, CF1/2: %d/%d MHz), disconnecting\n", + "AP %pM switches to unsupported channel " + "(%d.%03d MHz, width:%d, CF1/2: %d.%03d/%d MHz), " + "disconnecting\n", ifmgd->associated->bssid, csa_ie.chandef.chan->center_freq, + csa_ie.chandef.chan->freq_offset, csa_ie.chandef.width, csa_ie.chandef.center_freq1, + csa_ie.chandef.freq1_offset, csa_ie.chandef.center_freq2); ieee80211_queue_work(&local->hw, &ifmgd->csa_connection_drop_work); diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index c710504ccf1a..db3b8bf75656 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -557,6 +557,10 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, lockdep_assert_held(&local->mtx); + if (channel->freq_offset) + /* this may work, but is untested */ + return -EOPNOTSUPP; + if (local->use_chanctx && !local->ops->remain_on_channel) return -EOPNOTSUPP; diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index fdac8192a519..4d14118dddca 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -896,6 +896,7 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local, local->scan_chandef.chan = chan; local->scan_chandef.center_freq1 = chan->center_freq; + local->scan_chandef.freq1_offset = chan->freq_offset; local->scan_chandef.center_freq2 = 0; switch (scan_req->scan_width) { case NL80211_BSS_CHAN_WIDTH_5: diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 7ff22f9d6e80..8ad420db3766 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -1566,6 +1566,10 @@ ieee80211_tdls_channel_switch(struct wiphy *wiphy, struct net_device *dev, u32 ch_sw_tm_ie; int ret; + if (chandef->chan->freq_offset) + /* this may work, but is untested */ + return -EOPNOTSUPP; + mutex_lock(&local->sta_mtx); sta = sta_info_get(sdata, addr); if (!sta) { diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 427f51a0a994..1b4709694d2a 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -37,32 +37,42 @@ #define VIF_PR_ARG __get_str(vif_name), __entry->vif_type, __entry->p2p ? "/p2p" : "" #define CHANDEF_ENTRY __field(u32, control_freq) \ + __field(u32, freq_offset) \ __field(u32, chan_width) \ __field(u32, center_freq1) \ + __field(u32, freq1_offset) \ __field(u32, center_freq2) #define CHANDEF_ASSIGN(c) \ __entry->control_freq = (c) ? ((c)->chan ? (c)->chan->center_freq : 0) : 0; \ + __entry->freq_offset = (c) ? ((c)->chan ? (c)->chan->freq_offset : 0) : 0; \ __entry->chan_width = (c) ? (c)->width : 0; \ __entry->center_freq1 = (c) ? (c)->center_freq1 : 0; \ + __entry->freq1_offset = (c) ? (c)->freq1_offset : 0; \ __entry->center_freq2 = (c) ? 
(c)->center_freq2 : 0; -#define CHANDEF_PR_FMT " control:%d MHz width:%d center: %d/%d MHz" -#define CHANDEF_PR_ARG __entry->control_freq, __entry->chan_width, \ - __entry->center_freq1, __entry->center_freq2 +#define CHANDEF_PR_FMT " control:%d.%03d MHz width:%d center: %d.%03d/%d MHz" +#define CHANDEF_PR_ARG __entry->control_freq, __entry->freq_offset, __entry->chan_width, \ + __entry->center_freq1, __entry->freq1_offset, __entry->center_freq2 #define MIN_CHANDEF_ENTRY \ __field(u32, min_control_freq) \ + __field(u32, min_freq_offset) \ __field(u32, min_chan_width) \ __field(u32, min_center_freq1) \ + __field(u32, min_freq1_offset) \ __field(u32, min_center_freq2) #define MIN_CHANDEF_ASSIGN(c) \ __entry->min_control_freq = (c)->chan ? (c)->chan->center_freq : 0; \ + __entry->min_freq_offset = (c)->chan ? (c)->chan->freq_offset : 0; \ __entry->min_chan_width = (c)->width; \ __entry->min_center_freq1 = (c)->center_freq1; \ + __entry->freq1_offset = (c)->freq1_offset; \ __entry->min_center_freq2 = (c)->center_freq2; -#define MIN_CHANDEF_PR_FMT " min_control:%d MHz min_width:%d min_center: %d/%d MHz" -#define MIN_CHANDEF_PR_ARG __entry->min_control_freq, __entry->min_chan_width, \ - __entry->min_center_freq1, __entry->min_center_freq2 +#define MIN_CHANDEF_PR_FMT " min_control:%d.%03d MHz min_width:%d min_center: %d.%03d/%d MHz" +#define MIN_CHANDEF_PR_ARG __entry->min_control_freq, __entry->min_freq_offset, \ + __entry->min_chan_width, \ + __entry->min_center_freq1, __entry->min_freq1_offset, \ + __entry->min_center_freq2 #define CHANCTX_ENTRY CHANDEF_ENTRY \ MIN_CHANDEF_ENTRY \ @@ -412,6 +422,7 @@ TRACE_EVENT(drv_bss_info_changed, __field(s32, cqm_rssi_hyst) __field(u32, channel_width) __field(u32, channel_cfreq1) + __field(u32, channel_cfreq1_offset) __dynamic_array(u32, arp_addr_list, info->arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ? IEEE80211_BSS_ARP_ADDR_LIST_LEN : @@ -452,6 +463,7 @@ TRACE_EVENT(drv_bss_info_changed, __entry->cqm_rssi_hyst = info->cqm_rssi_hyst; __entry->channel_width = info->chandef.width; __entry->channel_cfreq1 = info->chandef.center_freq1; + __entry->channel_cfreq1_offset = info->chandef.freq1_offset; __entry->arp_addr_cnt = info->arp_addr_cnt; memcpy(__get_dynamic_array(arp_addr_list), info->arp_addr_list, sizeof(u32) * (info->arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ? 
@@ -1223,6 +1235,7 @@ TRACE_EVENT(drv_remain_on_channel, LOCAL_ENTRY VIF_ENTRY __field(int, center_freq) + __field(int, freq_offset) __field(unsigned int, duration) __field(u32, type) ), @@ -1231,14 +1244,16 @@ TRACE_EVENT(drv_remain_on_channel, LOCAL_ASSIGN; VIF_ASSIGN; __entry->center_freq = chan->center_freq; + __entry->freq_offset = chan->freq_offset; __entry->duration = duration; __entry->type = type; ), TP_printk( - LOCAL_PR_FMT VIF_PR_FMT " freq:%dMHz duration:%dms type=%d", + LOCAL_PR_FMT VIF_PR_FMT " freq:%d.%03dMHz duration:%dms type=%d", LOCAL_PR_ARG, VIF_PR_ARG, - __entry->center_freq, __entry->duration, __entry->type + __entry->center_freq, __entry->freq_offset, + __entry->duration, __entry->type ) ); @@ -1546,8 +1561,10 @@ struct trace_vif_entry { struct trace_chandef_entry { u32 control_freq; + u32 freq_offset; u32 chan_width; u32 center_freq1; + u32 freq1_offset; u32 center_freq2; } __packed; @@ -1597,18 +1614,26 @@ TRACE_EVENT(drv_switch_vif_chanctx, sizeof(local_vifs[i].vif.vif_name)); SWITCH_ENTRY_ASSIGN(old_chandef.control_freq, old_ctx->def.chan->center_freq); + SWITCH_ENTRY_ASSIGN(old_chandef.freq_offset, + old_ctx->def.chan->freq_offset); SWITCH_ENTRY_ASSIGN(old_chandef.chan_width, old_ctx->def.width); SWITCH_ENTRY_ASSIGN(old_chandef.center_freq1, old_ctx->def.center_freq1); + SWITCH_ENTRY_ASSIGN(old_chandef.freq1_offset, + old_ctx->def.freq1_offset); SWITCH_ENTRY_ASSIGN(old_chandef.center_freq2, old_ctx->def.center_freq2); SWITCH_ENTRY_ASSIGN(new_chandef.control_freq, new_ctx->def.chan->center_freq); + SWITCH_ENTRY_ASSIGN(new_chandef.freq_offset, + new_ctx->def.chan->freq_offset); SWITCH_ENTRY_ASSIGN(new_chandef.chan_width, new_ctx->def.width); SWITCH_ENTRY_ASSIGN(new_chandef.center_freq1, new_ctx->def.center_freq1); + SWITCH_ENTRY_ASSIGN(new_chandef.freq1_offset, + new_ctx->def.freq1_offset); SWITCH_ENTRY_ASSIGN(new_chandef.center_freq2, new_ctx->def.center_freq2); } -- cgit v1.2.3 From 3b23c184f72acddad39c40373f165e1a9e384758 Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Wed, 1 Apr 2020 18:18:05 -0700 Subject: mac80211: add freq_offset to RX status RX status needs a KHz component, so add freq_offset. We can reduce the bits for the frequency since 60 GHz isn't supported. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200402011810.22947-5-thomas@adapt-ip.com [fix commit message] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 10 +++++++++- net/mac80211/mlme.c | 6 ++++-- net/mac80211/rx.c | 1 + net/mac80211/scan.c | 3 ++- 4 files changed, 16 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index f3147633dda2..2936049f918e 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1335,6 +1335,7 @@ enum mac80211_rx_encoding { * @freq: frequency the radio was tuned to when receiving this frame, in MHz * This field must be set for management frames, but isn't strictly needed * for data (other) frames - for those it only affects radiotap reporting. + * @freq_offset: @freq has a positive offset of 500Khz. 
* @signal: signal strength when receiving this frame, either in dBm, in dB or * unspecified depending on the hardware capabilities flags * @IEEE80211_HW_SIGNAL_* @@ -1365,7 +1366,7 @@ struct ieee80211_rx_status { u32 device_timestamp; u32 ampdu_reference; u32 flag; - u16 freq; + u16 freq: 13, freq_offset: 1; u8 enc_flags; u8 encoding:2, bw:3, he_ru:3; u8 he_gi:2, he_dcm:1; @@ -1381,6 +1382,13 @@ struct ieee80211_rx_status { u8 zero_length_psdu_type; }; +static inline u32 +ieee80211_rx_status_to_khz(struct ieee80211_rx_status *rx_status) +{ + return MHZ_TO_KHZ(rx_status->freq) + + (rx_status->freq_offset ? 500 : 0); +} + /** * struct ieee80211_vendor_radiotap - vendor radiotap data information * @present: presence bitmap for this vendor namespace diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index acc8adf50d69..a259b4487b60 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3683,7 +3683,8 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, sdata_assert_lock(sdata); - channel = ieee80211_get_channel(local->hw.wiphy, rx_status->freq); + channel = ieee80211_get_channel_khz(local->hw.wiphy, + ieee80211_rx_status_to_khz(rx_status)); if (!channel) return; @@ -3899,7 +3900,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, return; } - if (rx_status->freq != chanctx_conf->def.chan->center_freq) { + if (ieee80211_rx_status_to_khz(rx_status) != + ieee80211_channel_to_khz(chanctx_conf->def.chan)) { rcu_read_unlock(); return; } diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index a724551b8ddf..eaf8931e4627 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -412,6 +412,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, pos++; /* IEEE80211_RADIOTAP_CHANNEL */ + /* TODO: frequency offset in KHz */ put_unaligned_le16(status->freq, pos); pos += 2; if (status->bw == RATE_INFO_BW_10) diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 4d14118dddca..5db15996524f 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -275,7 +275,8 @@ void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) return; } - channel = ieee80211_get_channel(local->hw.wiphy, rx_status->freq); + channel = ieee80211_get_channel_khz(local->hw.wiphy, + ieee80211_rx_status_to_khz(rx_status)); if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) return; -- cgit v1.2.3 From be689f68d040702a3521035d267949d3927971f0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 24 Apr 2020 12:01:04 +0200 Subject: cfg80211: reject channels/chandefs with KHz offset >= 1000 This should be covered by the next MHz, make sure that the numbers are always normalized. 
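The normalization rule is easy to state as a stand-alone check (types simplified for illustration): an offset of 1000 KHz or more belongs in the MHz component, so such a value is rejected instead of being accepted in a non-normalized form:

#include <stdbool.h>
#include <stdio.h>

struct chandef {
        unsigned int center_freq1;      /* MHz */
        unsigned int freq1_offset;      /* KHz */
};

static bool offset_normalized(const struct chandef *c)
{
        return c->freq1_offset < 1000;
}

int main(void)
{
        struct chandef ok = { 902, 500 };
        struct chandef bad = { 902, 1500 }; /* should be 903 MHz + 500 KHz */

        printf("500 KHz offset: %s\n",
               offset_normalized(&ok) ? "valid" : "invalid");
        printf("1500 KHz offset: %s\n",
               offset_normalized(&bad) ? "valid" : "invalid");
        return 0;
}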
Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200424120103.12b91ecf75f9.I4bf499d58404283bbfacb517d614a816763bccf2@changeid Signed-off-by: Johannes Berg --- net/wireless/chan.c | 3 +++ net/wireless/core.c | 3 +++ 2 files changed, 6 insertions(+) (limited to 'net') diff --git a/net/wireless/chan.c b/net/wireless/chan.c index d60e50a3b910..e111c08daa0e 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -147,6 +147,9 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) if (!chandef->chan) return false; + if (chandef->freq1_offset >= 1000) + return false; + control_freq = chandef->chan->center_freq; switch (chandef->width) { diff --git a/net/wireless/core.c b/net/wireless/core.c index 5757dea2aa94..b795f363d004 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -834,6 +834,9 @@ int wiphy_register(struct wiphy *wiphy) sband->channels[i].orig_mpwr = sband->channels[i].max_power; sband->channels[i].band = band; + + if (WARN_ON(sband->channels[i].freq_offset >= 1000)) + return -EINVAL; } for (i = 0; i < sband->n_iftype_data; i++) { -- cgit v1.2.3 From b6b5c42e3bab939d357d800fd313e3c995164065 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 24 Apr 2020 12:39:46 +0200 Subject: mac80211: fix two missing documentation entries Add documentation for two struct entries that were missing. Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200424123945.6b23a26ab5e7.I664440ab5f33442df8103253bf5b9fe84be8d58c@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ net/mac80211/sta_info.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 2936049f918e..ecb219e3ec4f 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -820,6 +820,8 @@ enum mac80211_tx_info_flags { * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path * @IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP: This frame skips mesh path lookup + * @IEEE80211_TX_CTRL_HW_80211_ENCAP: This frame uses hardware encapsulation + * (header conversion) * * These flags are used in tx_info->control.flags. */ diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index a5de3aa6ea42..49728047dfad 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -69,6 +69,8 @@ * @WLAN_STA_MPSP_RECIPIENT: local STA is recipient of a MPSP. * @WLAN_STA_PS_DELIVER: station woke up, but we're still blocking TX * until pending frames are delivered + * @WLAN_STA_USES_ENCRYPTION: This station was configured for encryption, + * so drop all packets without a key later. * * @NUM_WLAN_STA_FLAGS: number of defined flags */ -- cgit v1.2.3 From e73f94d1b6f05f6f22434c63de255a9dec6fd23d Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 24 Apr 2020 21:14:37 +0800 Subject: batman-adv: remove unused inline function batadv_arp_change_timeout There are no callers in-tree.
Signed-off-by: YueHaibing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/distributed-arp-table.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index 2bff2f4a325c..4e031661682a 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -163,11 +163,6 @@ static inline void batadv_dat_init_own_addr(struct batadv_priv *bat_priv, { } -static inline void batadv_arp_change_timeout(struct net_device *soft_iface, - const char *name) -{ -} - static inline int batadv_dat_init(struct batadv_priv *bat_priv) { return 0; -- cgit v1.2.3 From b70ba69ef1f72e58c8e43b5689a37a66a7b31d11 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 23 Apr 2020 16:57:45 +0200 Subject: net: sched: report ndo_setup_tc failures via extack Help end-users of the 'tc' command to see if the drivers ndo_setup_tc function call fails. Troubleshooting when this happens is non-trivial (see full process here[1]), and results in net_device getting assigned the 'qdisc noop', which will drop all TX packets on the interface. [1]: https://github.com/xdp-project/xdp-project/blob/master/areas/arm64/board_nxp_ls1088/nxp-board04-troubleshoot-qdisc.org Signed-off-by: Jesper Dangaard Brouer Tested-by: Ioana Ciornei Signed-off-by: David S. Miller --- net/sched/cls_api.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 55bd1429678f..11b683c45c28 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -735,8 +735,11 @@ static int tcf_block_offload_cmd(struct tcf_block *block, INIT_LIST_HEAD(&bo.cb_list); err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); - if (err < 0) + if (err < 0) { + if (err != -EOPNOTSUPP) + NL_SET_ERR_MSG(extack, "Driver ndo_setup_tc failed"); return err; + } return tcf_block_setup(block, &bo); } -- cgit v1.2.3 From 5c05c1dbb177293636a3f5ea4caa872dfcf50ccd Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 23 Apr 2020 17:02:56 +0100 Subject: net: phylink, dsa: eliminate phylink_fixed_state_cb() Move the callback into the phylink_config structure, rather than providing a callback to set this up. Signed-off-by: Russell King Tested-by: Florian Fainelli Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/phylink.c | 46 +++++++++++++++------------------------------- include/linux/phylink.h | 6 +++--- net/dsa/slave.c | 20 +++++++++++--------- 3 files changed, 29 insertions(+), 43 deletions(-) (limited to 'net') diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 34ca12aec61b..0f23bec431c1 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -480,8 +480,8 @@ static void phylink_get_fixed_state(struct phylink *pl, struct phylink_link_state *state) { *state = pl->link_config; - if (pl->get_fixed_state) - pl->get_fixed_state(pl->netdev, state); + if (pl->config->get_fixed_state) + pl->config->get_fixed_state(pl->config, state); else if (pl->link_gpio) state->link = !!gpiod_get_value_cansleep(pl->link_gpio); @@ -1044,32 +1044,6 @@ void phylink_disconnect_phy(struct phylink *pl) } EXPORT_SYMBOL_GPL(phylink_disconnect_phy); -/** - * phylink_fixed_state_cb() - allow setting a fixed link callback - * @pl: a pointer to a &struct phylink returned from phylink_create() - * @cb: callback to execute to determine the fixed link state. 
- * - * The MAC driver should call this driver when the state of its link - * can be determined through e.g: an out of band MMIO register. - */ -int phylink_fixed_state_cb(struct phylink *pl, - void (*cb)(struct net_device *dev, - struct phylink_link_state *state)) -{ - /* It does not make sense to let the link be overriden unless we use - * MLO_AN_FIXED - */ - if (pl->cfg_link_an_mode != MLO_AN_FIXED) - return -EINVAL; - - mutex_lock(&pl->state_mutex); - pl->get_fixed_state = cb; - mutex_unlock(&pl->state_mutex); - - return 0; -} -EXPORT_SYMBOL_GPL(phylink_fixed_state_cb); - /** * phylink_mac_change() - notify phylink of a change in MAC state * @pl: a pointer to a &struct phylink returned from phylink_create() @@ -1106,6 +1080,8 @@ static irqreturn_t phylink_link_handler(int irq, void *data) */ void phylink_start(struct phylink *pl) { + bool poll = false; + ASSERT_RTNL(); phylink_info(pl, "configuring for %s/%s link mode\n", @@ -1142,10 +1118,18 @@ void phylink_start(struct phylink *pl) irq = 0; } if (irq <= 0) - mod_timer(&pl->link_poll, jiffies + HZ); + poll = true; + } + + switch (pl->cfg_link_an_mode) { + case MLO_AN_FIXED: + poll |= pl->config->poll_fixed_state; + break; + case MLO_AN_INBAND: + poll |= pl->config->pcs_poll; + break; } - if ((pl->cfg_link_an_mode == MLO_AN_FIXED && pl->get_fixed_state) || - pl->config->pcs_poll) + if (poll) mod_timer(&pl->link_poll, jiffies + HZ); if (pl->phydev) phy_start(pl->phydev); diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 3f8d37ec5503..cc5b452a184e 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -67,6 +67,9 @@ struct phylink_config { struct device *dev; enum phylink_op_type type; bool pcs_poll; + bool poll_fixed_state; + void (*get_fixed_state)(struct phylink_config *config, + struct phylink_link_state *state); }; /** @@ -366,9 +369,6 @@ void phylink_destroy(struct phylink *); int phylink_connect_phy(struct phylink *, struct phy_device *); int phylink_of_phy_connect(struct phylink *, struct device_node *, u32 flags); void phylink_disconnect_phy(struct phylink *); -int phylink_fixed_state_cb(struct phylink *, - void (*cb)(struct net_device *dev, - struct phylink_link_state *)); void phylink_mac_change(struct phylink *, bool up); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index f2c241cf3a80..1035230771ae 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1590,10 +1590,10 @@ void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up) } EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change); -static void dsa_slave_phylink_fixed_state(struct net_device *dev, +static void dsa_slave_phylink_fixed_state(struct phylink_config *config, struct phylink_link_state *state) { - struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); struct dsa_switch *ds = dp->ds; /* No need to check that this operation is valid, the callback would @@ -1633,6 +1633,15 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) dp->pl_config.dev = &slave_dev->dev; dp->pl_config.type = PHYLINK_NETDEV; + /* The get_fixed_state callback takes precedence over polling the + * link GPIO in PHYLINK (see phylink_get_fixed_state). Only set + * this if the switch provides such a callback. 
+ */ + if (ds->ops->phylink_fixed_state) { + dp->pl_config.get_fixed_state = dsa_slave_phylink_fixed_state; + dp->pl_config.poll_fixed_state = true; + } + dp->pl = phylink_create(&dp->pl_config, of_fwnode_handle(port_dn), mode, &dsa_port_phylink_mac_ops); if (IS_ERR(dp->pl)) { @@ -1641,13 +1650,6 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) return PTR_ERR(dp->pl); } - /* Register only if the switch provides such a callback, since this - * callback takes precedence over polling the link GPIO in PHYLINK - * (see phylink_get_fixed_state). - */ - if (ds->ops->phylink_fixed_state) - phylink_fixed_state_cb(dp->pl, dsa_slave_phylink_fixed_state); - if (ds->ops->get_phy_flags) phy_flags = ds->ops->get_phy_flags(ds, dp->index); -- cgit v1.2.3 From 071c8ed6e88d2ac0a5f26948fb9c288fd4dd6e40 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 24 Apr 2020 12:31:50 +0200 Subject: tcp: mptcp: use mptcp receive buffer space to select rcv window In MPTCP, the receive window is shared across all subflows, because it refers to the mptcp-level sequence space. MPTCP receivers already place incoming packets on the mptcp socket receive queue and will charge it to the mptcp socket rcvbuf until userspace consumes the data. Update __tcp_select_window to use the occupancy of the parent/mptcp socket instead of the subflow socket in case the tcp socket is part of a logical mptcp connection. This commit doesn't change the choice of initial window for passive or active connections. While it would be possible to change those as well, this adds complexity (especially when handling MP_JOIN requests). Furthermore, the MPTCP RFC specifically says that an MPTCP sender 'MUST NOT use the RCV.WND field of a TCP segment at the connection level if it does not also carry a DSS option with a Data ACK field.' SYN/SYNACK packets do not carry a DSS option with a Data ACK field. Signed-off-by: Florian Westphal Signed-off-by: David S.
Miller --- include/net/mptcp.h | 3 +++ net/ipv4/tcp_output.c | 8 ++++++-- net/mptcp/subflow.c | 18 ++++++++++++++++++ 3 files changed, 27 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 0e7c5471010b..5288fba56e55 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -68,6 +68,8 @@ static inline bool rsk_is_mptcp(const struct request_sock *req) return tcp_rsk(req)->is_mptcp; } +void mptcp_space(const struct sock *ssk, int *space, int *full_space); + void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, int opsize, struct tcp_options_received *opt_rx); bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, @@ -197,6 +199,7 @@ static inline bool mptcp_sk_is_subflow(const struct sock *sk) return false; } +static inline void mptcp_space(const struct sock *ssk, int *s, int *fs) { } static inline void mptcp_seq_show(struct seq_file *seq) { } #endif /* CONFIG_MPTCP */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2f45cde168c4..ba4482130f08 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2772,8 +2772,12 @@ u32 __tcp_select_window(struct sock *sk) int mss = icsk->icsk_ack.rcv_mss; int free_space = tcp_space(sk); int allowed_space = tcp_full_space(sk); - int full_space = min_t(int, tp->window_clamp, allowed_space); - int window; + int full_space, window; + + if (sk_is_mptcp(sk)) + mptcp_space(sk, &free_space, &allowed_space); + + full_space = min_t(int, tp->window_clamp, allowed_space); if (unlikely(mss > full_space)) { mss = full_space; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index fabd06f2ff45..87c094702d63 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -821,6 +821,24 @@ bool mptcp_subflow_data_available(struct sock *sk) return subflow->data_avail; } +/* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy, + * not the ssk one. + * + * In mptcp, rwin is about the mptcp-level connection data. + * + * Data that is still on the ssk rx queue can thus be ignored, + * as far as mptcp peer is concerned that data is still inflight. + * DSS ACK is updated when skb is moved to the mptcp rx queue. + */ +void mptcp_space(const struct sock *ssk, int *space, int *full_space) +{ + const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + const struct sock *sk = subflow->conn; + + *space = tcp_space(sk); + *full_space = tcp_full_space(sk); +} + static void subflow_data_ready(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); -- cgit v1.2.3 From f30e472071c827e2c2f191c8a011b7e371e17af5 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Fri, 24 Apr 2020 12:43:09 +0000 Subject: hsr: remove unnecessary code in hsr_dev_change_mtu() In hsr_dev_change_mtu(), the 'dev' and 'master->dev' pointers are the same. So, the 'master' variable and some code are unnecessary. Signed-off-by: Taehee Yoo Signed-off-by: David S.
Miller --- net/hsr/hsr_device.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index fc7027314ad8..cd99f548e440 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -125,13 +125,11 @@ int hsr_get_max_mtu(struct hsr_priv *hsr) static int hsr_dev_change_mtu(struct net_device *dev, int new_mtu) { struct hsr_priv *hsr; - struct hsr_port *master; hsr = netdev_priv(dev); - master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); if (new_mtu > hsr_get_max_mtu(hsr)) { - netdev_info(master->dev, "A HSR master's MTU cannot be greater than the smallest MTU of its slaves minus the HSR Tag length (%d octets).\n", + netdev_info(dev, "A HSR master's MTU cannot be greater than the smallest MTU of its slaves minus the HSR Tag length (%d octets).\n", HSR_HLEN); return -EINVAL; } -- cgit v1.2.3 From df346f1aac6cef551b69d788b022a942270dc17b Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 24 Apr 2020 21:13:34 +0800 Subject: dccp: remove unused inline function dccp_set_seqno There are no callers in-tree since commit 792b48780e8b ("dccp: Implement both feature-local and feature-remote Sequence Window feature") Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/dccp/dccp.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 9c3b27c257bb..7dce4f6c7025 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -108,11 +108,6 @@ extern int sysctl_dccp_sync_ratelimit; #define ADD48(a, b) (((a) + (b)) & UINT48_MAX) #define SUB48(a, b) ADD48((a), COMPLEMENT48(b)) -static inline void dccp_set_seqno(u64 *seqno, u64 value) -{ - *seqno = value & UINT48_MAX; -} - static inline void dccp_inc_seqno(u64 *seqno) { *seqno = ADD48(*seqno, 1); -- cgit v1.2.3 From 4b36a0dff794a00989a50581aed2f94c88b57107 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Sat, 25 Apr 2020 11:39:47 +0800 Subject: net: openvswitch: suitable access to the dp_meters To fix the following sparse warning: | net/openvswitch/meter.c:109:38: sparse: sparse: incorrect type | in assignment (different address spaces) ... | net/openvswitch/meter.c:720:45: sparse: sparse: incorrect type | in argument 1 (different address spaces) ... Reported-by: kbuild test robot Signed-off-by: Tonghao Zhang Signed-off-by: David S. Miller --- net/openvswitch/meter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 915f31123f23..612ad5586ce9 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -107,8 +107,8 @@ dp_meter_instance_realloc(struct dp_meter_table *tbl, u32 size) return -ENOMEM; for (i = 0; i < n_meters; i++) - new_ti->dp_meters[i] = - rcu_dereference_ovsl(ti->dp_meters[i]); + if (rcu_dereference_ovsl(ti->dp_meters[i])) + new_ti->dp_meters[i] = ti->dp_meters[i]; rcu_assign_pointer(tbl->ti, new_ti); call_rcu(&ti->rcu, dp_meter_instance_free_rcu); @@ -752,7 +752,7 @@ void ovs_meters_exit(struct datapath *dp) int i; for (i = 0; i < ti->n_meters; i++) - ovs_meter_free(ti->dp_meters[i]); + ovs_meter_free(rcu_dereference_raw(ti->dp_meters[i])); dp_meter_instance_free(ti); } -- cgit v1.2.3 From 659d4587fe7233bfdff303744b20d6f41ad04362 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Sat, 25 Apr 2020 11:39:48 +0800 Subject: net: openvswitch: use div_u64() for 64-by-32 divisions Compiling the kernel for the arm 32 platform produces the build warning below. To fix it, use div_u64() for the division.
| net/openvswitch/meter.c:396: undefined reference to `__udivdi3' [add more commit msg, change reported tag, and use div_u64 instead of do_div by Tonghao] Fixes: e57358873bb5d6ca ("net: openvswitch: use u64 for meter bucket") Reported-by: kbuild test robot Signed-off-by: Tonghao Zhang Tested-by: Tonghao Zhang Signed-off-by: David S. Miller --- net/openvswitch/meter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 612ad5586ce9..3d3d8e094546 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -393,7 +393,7 @@ static struct dp_meter *dp_meter_create(struct nlattr **a) * Start with a full bucket. */ band->bucket = (band->burst_size + band->rate) * 1000ULL; - band_max_delta_t = band->bucket / band->rate; + band_max_delta_t = div_u64(band->bucket, band->rate); if (band_max_delta_t > meter->max_delta_t) meter->max_delta_t = band_max_delta_t; band++; -- cgit v1.2.3 From 0456ea170cd665ddbb9503be92e39f96055dd5fa Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 20 Apr 2020 10:46:10 -0700 Subject: bpf: Enable more helpers for BPF_PROG_TYPE_CGROUP_{DEVICE,SYSCTL,SOCKOPT} Currently the following prog types don't fall back to bpf_base_func_proto() (instead they have cgroup_base_func_proto which has a limited set of helpers from bpf_base_func_proto): * BPF_PROG_TYPE_CGROUP_DEVICE * BPF_PROG_TYPE_CGROUP_SYSCTL * BPF_PROG_TYPE_CGROUP_SOCKOPT I don't see any specific reason why we shouldn't use bpf_base_func_proto(); every other type of program (except bpf-lirc and, understandably, tracing) uses it, so let's fall back to bpf_base_func_proto for those prog types as well. This basically boils down to adding access to the following helpers: * BPF_FUNC_get_prandom_u32 * BPF_FUNC_get_smp_processor_id * BPF_FUNC_get_numa_node_id * BPF_FUNC_tail_call * BPF_FUNC_ktime_get_ns * BPF_FUNC_spin_lock (CAP_SYS_ADMIN) * BPF_FUNC_spin_unlock (CAP_SYS_ADMIN) * BPF_FUNC_jiffies64 (CAP_SYS_ADMIN) I've also added bpf_perf_event_output() because it's really handy for logging and debugging.
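For illustration, a minimal cgroup/sysctl program could now combine two of these helpers; this is only a sketch, not part of the change, and the map name and always-allow policy are assumptions:

// SPDX-License-Identifier: GPL-2.0
/* Illustrative sketch only: a BPF_PROG_TYPE_CGROUP_SYSCTL program
 * using bpf_ktime_get_ns() and bpf_perf_event_output(), both newly
 * reachable for this program type. Map name and policy are made up.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
        __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
        __uint(key_size, sizeof(int));
        __uint(value_size, sizeof(__u32));
} events SEC(".maps");

SEC("cgroup/sysctl")
int sysctl_audit(struct bpf_sysctl *ctx)
{
        __u64 ts = bpf_ktime_get_ns();

        /* Emit a timestamped event for every sysctl access. */
        bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
                              &ts, sizeof(ts));

        return 1;       /* allow the access */
}

char _license[] SEC("license") = "GPL";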
Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200420174610.77494-1-sdf@google.com --- include/linux/bpf.h | 1 + kernel/bpf/cgroup.c | 20 +++--------------- net/core/filter.c | 2 +- .../testing/selftests/bpf/verifier/event_output.c | 24 ++++++++++++++++++++++ 4 files changed, 29 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fd2b2322412d..25da6ff2a880 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1523,6 +1523,7 @@ extern const struct bpf_func_proto bpf_strtoul_proto; extern const struct bpf_func_proto bpf_tcp_sock_proto; extern const struct bpf_func_proto bpf_jiffies64_proto; extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto; +extern const struct bpf_func_proto bpf_event_output_data_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index cb305e71e7de..4d748c5785bc 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1060,30 +1060,16 @@ static const struct bpf_func_proto * cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - case BPF_FUNC_map_push_elem: - return &bpf_map_push_elem_proto; - case BPF_FUNC_map_pop_elem: - return &bpf_map_pop_elem_proto; - case BPF_FUNC_map_peek_elem: - return &bpf_map_peek_elem_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_trace_printk: - if (capable(CAP_SYS_ADMIN)) - return bpf_get_trace_printk_proto(); - /* fall through */ + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; default: - return NULL; + return bpf_base_func_proto(func_id); } } diff --git a/net/core/filter.c b/net/core/filter.c index 7d6ceaa54d21..a943df3ad8b0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4214,7 +4214,7 @@ BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, u64, flags return bpf_event_output(map, flags, data, size, NULL, 0, NULL); } -static const struct bpf_func_proto bpf_event_output_data_proto = { +const struct bpf_func_proto bpf_event_output_data_proto = { .func = bpf_event_output_data, .gpl_only = true, .ret_type = RET_INTEGER, diff --git a/tools/testing/selftests/bpf/verifier/event_output.c b/tools/testing/selftests/bpf/verifier/event_output.c index 130553e19eca..99f8f582c02b 100644 --- a/tools/testing/selftests/bpf/verifier/event_output.c +++ b/tools/testing/selftests/bpf/verifier/event_output.c @@ -92,3 +92,27 @@ .result = ACCEPT, .retval = 1, }, +{ + "perfevent for cgroup dev", + .insns = { __PERF_EVENT_INSNS__ }, + .prog_type = BPF_PROG_TYPE_CGROUP_DEVICE, + .fixup_map_event_output = { 4 }, + .result = ACCEPT, + .retval = 1, +}, +{ + "perfevent for cgroup sysctl", + .insns = { __PERF_EVENT_INSNS__ }, + .prog_type = BPF_PROG_TYPE_CGROUP_SYSCTL, + .fixup_map_event_output = { 4 }, + .result = ACCEPT, + .retval = 1, +}, +{ + "perfevent for cgroup sockopt", + .insns = { __PERF_EVENT_INSNS__ }, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCKOPT, + .fixup_map_event_output = { 4 }, + .result = ACCEPT, + 
.retval = 1, +}, -- cgit v1.2.3 From 6890896bd765b0504761c61901c9804fca23bfb2 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 24 Apr 2020 16:59:41 -0700 Subject: bpf: Fix missing bpf_base_func_proto in cgroup_base_func_proto for CGROUP_NET=n The linux-next build bot reported a compile issue [1] with one of its configs. It looks like when we have CONFIG_NET=n and CONFIG_BPF{,_SYSCALL}=y, we are missing the bpf_base_func_proto definition (from net/core/filter.c) in cgroup_base_func_proto. I'm reshuffling the code a bit to make it work. The common helpers are moved into kernel/bpf/helpers.c and the bpf_base_func_proto is exported from there. Also, bpf_get_raw_cpu_id goes into kernel/bpf/core.c akin to the existing bpf_user_rnd_u32. [1] https://lore.kernel.org/linux-next/CAKH8qBsBvKHswiX1nx40LgO+BGeTmb1NX8tiTttt_0uu6T3dCA@mail.gmail.com/T/#mff8b0c083314c68c2e2ef0211cb11bc20dc13c72 Fixes: 0456ea170cd6 ("bpf: Enable more helpers for BPF_PROG_TYPE_CGROUP_{DEVICE,SYSCTL,SOCKOPT}") Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Cc: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200424235941.58382-1-sdf@google.com --- include/linux/bpf.h | 8 ++++++ include/linux/filter.h | 2 -- kernel/bpf/core.c | 5 ++++ kernel/bpf/helpers.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++ net/core/filter.c | 78 +------------------------------------------------- 5 files changed, 87 insertions(+), 79 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 25da6ff2a880..5147e11e53ff 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1215,6 +1215,7 @@ int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog, struct bpf_prog *bpf_prog_by_id(u32 id); +const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -1365,6 +1366,12 @@ static inline struct bpf_prog *bpf_prog_by_id(u32 id) { return ERR_PTR(-ENOTSUPP); } + +static inline const struct bpf_func_proto * +bpf_base_func_proto(enum bpf_func_id func_id) +{ + return NULL; +} #endif /* CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, @@ -1531,6 +1538,7 @@ const struct bpf_func_proto *bpf_tracing_func_proto( /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +u64 bpf_get_raw_cpu_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); #if defined(CONFIG_NET) bool bpf_sock_common_is_valid_access(int off, int size, diff --git a/include/linux/filter.h b/include/linux/filter.h index 9b5aa5c483cc..af37318bb1c5 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -863,8 +863,6 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog); int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, bpf_aux_classic_check_t trans, bool save_orig); void bpf_prog_destroy(struct bpf_prog *fp); -const struct bpf_func_proto * -bpf_base_func_proto(enum bpf_func_id func_id); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 916f5132a984..0cc91805069a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2136,6 +2136,11 @@ BPF_CALL_0(bpf_user_rnd_u32) return res; } +BPF_CALL_0(bpf_get_raw_cpu_id) +{ + return raw_smp_processor_id(); +} + /* Weak definitions of helper functions in case we don't have bpf syscall.
*/ const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bafc53ddd350..dbba4f41d508 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -562,3 +562,76 @@ const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = { .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; + +static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { + .func = bpf_get_raw_cpu_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, + u64, flags, void *, data, u64, size) +{ + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + + return bpf_event_output(map, flags, data, size, NULL, 0, NULL); +} + +const struct bpf_func_proto bpf_event_output_data_proto = { + .func = bpf_event_output_data, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + +const struct bpf_func_proto * +bpf_base_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; + case BPF_FUNC_get_prandom_u32: + return &bpf_get_prandom_u32_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_raw_smp_processor_id_proto; + case BPF_FUNC_get_numa_node_id: + return &bpf_get_numa_node_id_proto; + case BPF_FUNC_tail_call: + return &bpf_tail_call_proto; + case BPF_FUNC_ktime_get_ns: + return &bpf_ktime_get_ns_proto; + default: + break; + } + + if (!capable(CAP_SYS_ADMIN)) + return NULL; + + switch (func_id) { + case BPF_FUNC_spin_lock: + return &bpf_spin_lock_proto; + case BPF_FUNC_spin_unlock: + return &bpf_spin_unlock_proto; + case BPF_FUNC_trace_printk: + return bpf_get_trace_printk_proto(); + case BPF_FUNC_jiffies64: + return &bpf_jiffies64_proto; + default: + return NULL; + } +} diff --git a/net/core/filter.c b/net/core/filter.c index a943df3ad8b0..a605626142b6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -256,17 +256,6 @@ BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, offset); } -BPF_CALL_0(bpf_get_raw_cpu_id) -{ - return raw_smp_processor_id(); -} - -static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { - .func = bpf_get_raw_cpu_id, - .gpl_only = false, - .ret_type = RET_INTEGER, -}; - static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, struct bpf_insn *insn_buf) { @@ -4205,26 +4194,6 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, u64, flags, - void *, data, u64, size) -{ - if (unlikely(flags & ~(BPF_F_INDEX_MASK))) - return -EINVAL; - - return bpf_event_output(map, flags, data, size, NULL, 0, NULL); -} - -const struct bpf_func_proto bpf_event_output_data_proto = { - .func = bpf_event_output_data, - .gpl_only = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = 
ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE_OR_ZERO, -}; - BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { @@ -5983,52 +5952,7 @@ bool bpf_helper_changes_pkt_data(void *func) return false; } -const struct bpf_func_proto * -bpf_base_func_proto(enum bpf_func_id func_id) -{ - switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - case BPF_FUNC_map_push_elem: - return &bpf_map_push_elem_proto; - case BPF_FUNC_map_pop_elem: - return &bpf_map_pop_elem_proto; - case BPF_FUNC_map_peek_elem: - return &bpf_map_peek_elem_proto; - case BPF_FUNC_get_prandom_u32: - return &bpf_get_prandom_u32_proto; - case BPF_FUNC_get_smp_processor_id: - return &bpf_get_raw_smp_processor_id_proto; - case BPF_FUNC_get_numa_node_id: - return &bpf_get_numa_node_id_proto; - case BPF_FUNC_tail_call: - return &bpf_tail_call_proto; - case BPF_FUNC_ktime_get_ns: - return &bpf_ktime_get_ns_proto; - default: - break; - } - - if (!capable(CAP_SYS_ADMIN)) - return NULL; - - switch (func_id) { - case BPF_FUNC_spin_lock: - return &bpf_spin_lock_proto; - case BPF_FUNC_spin_unlock: - return &bpf_spin_unlock_proto; - case BPF_FUNC_trace_printk: - return bpf_get_trace_printk_proto(); - case BPF_FUNC_jiffies64: - return &bpf_jiffies64_proto; - default: - return NULL; - } -} +const struct bpf_func_proto bpf_event_output_data_proto __weak; static const struct bpf_func_proto * sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -- cgit v1.2.3 From 6f3f65d80dac8f2bafce2213005821fccdce194c Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Mon, 20 Apr 2020 11:34:08 -0700 Subject: net: bpf: Allow TC programs to call BPF_FUNC_skb_change_head This allows TC eBPF programs to modify and forward (redirect) packets from interfaces without ethernet headers (for example cellular) to interfaces with them (for example ethernet/wifi). The lack of this appears to simply be an oversight. Tested: in active use in Android R on 4.14+ devices for ipv6 cellular to wifi tethering offload.
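As a hedged illustration of the newly allowed call (not from this patch), a TC program on a header-less device could prepend an ethernet header before redirecting; the target ifindex and both MAC addresses below are placeholder assumptions:

// SPDX-License-Identifier: GPL-2.0
/* Illustrative sketch only: prepend an ethernet header with
 * bpf_skb_change_head() before redirecting from a header-less
 * (e.g. cellular) interface to an ethernet device.
 */
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_endian.h>
#include <bpf/bpf_helpers.h>

#define ETH_IFINDEX 2   /* hypothetical ethernet device */

SEC("classifier")
int cell_to_eth(struct __sk_buff *skb)
{
        struct ethhdr eth = {
                .h_dest   = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 },
                .h_source = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x02 },
                .h_proto  = bpf_htons(ETH_P_IPV6),
        };

        /* Make room for the header; now callable from TC programs. */
        if (bpf_skb_change_head(skb, sizeof(eth), 0))
                return TC_ACT_SHOT;
        /* Fill in the header we just reserved. */
        if (bpf_skb_store_bytes(skb, 0, &eth, sizeof(eth), 0))
                return TC_ACT_SHOT;

        return bpf_redirect(ETH_IFINDEX, 0);
}

char _license[] SEC("license") = "GPL";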
Signed-off-by: Lorenzo Colitti Signed-off-by: Maciej Żenczykowski Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index a605626142b6..da3b7a72c37c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6137,6 +6137,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_adjust_room_proto; case BPF_FUNC_skb_change_tail: return &bpf_skb_change_tail_proto; + case BPF_FUNC_skb_change_head: + return &bpf_skb_change_head_proto; case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: -- cgit v1.2.3 From 0a05861f80fe7d4dcfdabcc98d9854947573e072 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Wed, 22 Apr 2020 01:29:27 +0200 Subject: xsk: Fix typo in xsk_umem_consume_tx and xsk_generic_xmit comments s/backpreassure/backpressure/ Signed-off-by: Tobias Klauser Signed-off-by: Alexei Starovoitov Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20200421232927.21082-1-tklauser@distanz.ch --- net/xdp/xsk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index c350108aa38d..f6e6609f70a3 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -322,7 +322,7 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc) if (!xskq_cons_peek_desc(xs->tx, desc, umem)) continue; - /* This is the backpreassure mechanism for the Tx path. + /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed * if there is space in it. This avoids having to implement * any buffering in the Tx path. @@ -406,7 +406,7 @@ static int xsk_generic_xmit(struct sock *sk) addr = desc.addr; buffer = xdp_umem_get_data(xs->umem, addr); err = skb_store_bits(skb, 0, buffer, len); - /* This is the backpreassure mechanism for the Tx path. + /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed * if there is space in it. This avoids having to implement * any buffering in the Tx path. -- cgit v1.2.3 From 74f99482eae03195ced512b440b31d62bdb6e943 Mon Sep 17 00:00:00 2001 From: Bodong Wang Date: Tue, 21 Apr 2020 10:04:16 -0500 Subject: netfilter: nf_conntrack: add IPS_HW_OFFLOAD status bit This bit indicates that the conntrack entry is offloaded to a hardware flow table. The nf_conntrack entry will be tagged with [HW_OFFLOAD] if it is offloaded to hardware. cat /proc/net/nf_conntrack ipv4 2 tcp 6 \ src=1.1.1.17 dst=1.1.1.16 sport=56394 dport=5001 \ src=1.1.1.16 dst=1.1.1.17 sport=5001 dport=56394 [HW_OFFLOAD] \ mark=0 zone=0 use=3 Note that HW_OFFLOAD/OFFLOAD/ASSURED are mutually exclusive. Changelog: * V1->V2: - Remove check of lastused from stats. It was meant for cases such as removing the driver module while traffic is still running. Better to handle such cases from the garbage collector.
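For illustration only, a reader of ct->status could classify an entry with the same precedence ct_seq_show() uses after this change; the helper name is an assumption, only the bit tests mirror the patch:

#include <net/netfilter/nf_conntrack.h>

/* Illustrative sketch: mirror the ct_seq_show() precedence of
 * HW_OFFLOAD over OFFLOAD over ASSURED. The function name is
 * made up for the example.
 */
static const char *ct_offload_state(const struct nf_conn *ct)
{
        if (test_bit(IPS_HW_OFFLOAD_BIT, &ct->status))
                return "hw-offloaded";
        if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
                return "sw-offloaded";
        if (test_bit(IPS_ASSURED_BIT, &ct->status))
                return "assured";
        return "tracked";
}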
Signed-off-by: Bodong Wang Reviewed-by: Oz Shlomo Reviewed-by: Paul Blakey Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_conntrack_common.h | 8 ++++++-- net/netfilter/nf_conntrack_standalone.c | 4 +++- net/netfilter/nf_flow_table_offload.c | 3 +++ 3 files changed, 12 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index b6f0bb1dc799..4b3395082d15 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -114,15 +114,19 @@ enum ip_conntrack_status { IPS_OFFLOAD_BIT = 14, IPS_OFFLOAD = (1 << IPS_OFFLOAD_BIT), + /* Conntrack has been offloaded to hardware. */ + IPS_HW_OFFLOAD_BIT = 15, + IPS_HW_OFFLOAD = (1 << IPS_HW_OFFLOAD_BIT), + /* Be careful here, modifying these bits can make things messy, * so don't let users modify them directly. */ IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK | IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING | IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_UNTRACKED | - IPS_OFFLOAD), + IPS_OFFLOAD | IPS_HW_OFFLOAD), - __IPS_MAX_BIT = 15, + __IPS_MAX_BIT = 16, }; /* Connection tracking event types */ diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 9b57330c81f8..5a3e6c43ee68 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -348,7 +348,9 @@ static int ct_seq_show(struct seq_file *s, void *v) if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) goto release; - if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) + if (test_bit(IPS_HW_OFFLOAD_BIT, &ct->status)) + seq_puts(s, "[HW_OFFLOAD] "); + else if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) seq_puts(s, "[OFFLOAD] "); else if (test_bit(IPS_ASSURED_BIT, &ct->status)) seq_puts(s, "[ASSURED] "); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index e3b099c14eff..a2abb0feab7f 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -754,12 +754,15 @@ static void flow_offload_work_add(struct flow_offload_work *offload) err = flow_offload_rule_add(offload, flow_rule); if (err < 0) set_bit(NF_FLOW_HW_REFRESH, &offload->flow->flags); + else + set_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); nf_flow_offload_destroy(flow_rule); } static void flow_offload_work_del(struct flow_offload_work *offload) { + clear_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL); flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags); -- cgit v1.2.3 From 32927393dc1ccd60fb2bdc05b9e8e88753761469 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2020 08:43:38 +0200 Subject: sysctl: pass kernel pointers to ->proc_handler Instead of having all the sysctl handlers deal with user pointers, which is rather hairy in terms of the BPF interaction, copy the input to and from userspace in common code. This also means that the strings are always NUL-terminated by the common code, making the API a little bit safer. As most handlers just pass the data through to one of the common handlers, a lot of the changes are mechanical.
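For illustration, a custom handler under the new calling convention looks as follows; the handler name and the pr_debug() line are assumptions, while the signature and the proc_dointvec() call match the converted code below:

/* Illustrative sketch: a ->proc_handler after this change.
 * "buffer" is a kernel pointer now; on writes the common code
 * has already copied the data in and NUL-terminated it, so no
 * copy_from_user()/copy_to_user() (or set_fs() games) is needed.
 */
static int example_dointvec(struct ctl_table *table, int write,
                            void *buffer, size_t *lenp, loff_t *ppos)
{
        if (write)
                pr_debug("sysctl write: %.*s\n", (int)*lenp,
                         (char *)buffer);

        /* The common helpers keep the same signature. */
        return proc_dointvec(table, write, buffer, lenp, ppos);
}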
Signed-off-by: Christoph Hellwig Acked-by: Andrey Ignatov Signed-off-by: Al Viro --- arch/arm64/kernel/armv8_deprecated.c | 2 +- arch/arm64/kernel/fpsimd.c | 3 +- arch/mips/lasat/sysctl.c | 13 +- arch/s390/appldata/appldata_base.c | 11 +- arch/s390/kernel/debug.c | 2 +- arch/s390/kernel/topology.c | 2 +- arch/s390/mm/cmm.c | 12 +- arch/x86/kernel/itmt.c | 3 +- drivers/cdrom/cdrom.c | 2 +- drivers/char/random.c | 2 +- drivers/macintosh/mac_hid.c | 3 +- drivers/parport/procfs.c | 39 +++--- fs/dcache.c | 2 +- fs/drop_caches.c | 2 +- fs/file_table.c | 4 +- fs/fscache/main.c | 3 +- fs/inode.c | 2 +- fs/proc/proc_sysctl.c | 47 ++++--- fs/quota/dquot.c | 2 +- fs/xfs/xfs_sysctl.c | 4 +- include/linux/bpf-cgroup.h | 9 +- include/linux/compaction.h | 2 +- include/linux/fs.h | 6 +- include/linux/ftrace.h | 3 +- include/linux/hugetlb.h | 15 +- include/linux/kprobes.h | 2 +- include/linux/latencytop.h | 4 +- include/linux/mm.h | 12 +- include/linux/mmzone.h | 23 ++- include/linux/nmi.h | 15 +- include/linux/perf_event.h | 13 +- include/linux/printk.h | 2 +- include/linux/sched/sysctl.h | 44 ++---- include/linux/security.h | 2 +- include/linux/sysctl.h | 53 +++---- include/linux/timer.h | 3 +- include/linux/vmstat.h | 8 +- include/linux/writeback.h | 28 ++-- ipc/ipc_sysctl.c | 10 +- ipc/mq_sysctl.c | 4 +- kernel/bpf/cgroup.c | 35 ++--- kernel/events/callchain.c | 2 +- kernel/events/core.c | 6 +- kernel/kprobes.c | 2 +- kernel/latencytop.c | 4 +- kernel/pid_namespace.c | 2 +- kernel/printk/printk.c | 2 +- kernel/sched/core.c | 9 +- kernel/sched/fair.c | 3 +- kernel/sched/rt.c | 10 +- kernel/sched/topology.c | 2 +- kernel/seccomp.c | 2 +- kernel/sysctl.c | 239 ++++++++++++-------------------- kernel/time/timer.c | 3 +- kernel/trace/trace.c | 2 +- kernel/umh.c | 2 +- kernel/utsname_sysctl.c | 2 +- kernel/watchdog.c | 12 +- mm/compaction.c | 2 +- mm/hugetlb.c | 9 +- mm/page-writeback.c | 16 +-- mm/page_alloc.c | 30 ++-- mm/util.c | 10 +- mm/vmstat.c | 4 +- net/bridge/br_netfilter_hooks.c | 2 +- net/core/neighbour.c | 28 ++-- net/core/sysctl_net_core.c | 27 ++-- net/decnet/dn_dev.c | 7 +- net/decnet/sysctl_net_decnet.c | 27 ++-- net/ipv4/devinet.c | 9 +- net/ipv4/route.c | 3 +- net/ipv4/sysctl_net_ipv4.c | 38 ++--- net/ipv6/addrconf.c | 33 ++--- net/ipv6/ndisc.c | 3 +- net/ipv6/route.c | 5 +- net/ipv6/sysctl_net_ipv6.c | 3 +- net/mpls/af_mpls.c | 5 +- net/netfilter/ipvs/ip_vs_ctl.c | 6 +- net/netfilter/nf_conntrack_standalone.c | 2 +- net/netfilter/nf_log.c | 2 +- net/phonet/sysctl.c | 3 +- net/rds/tcp.c | 6 +- net/sctp/sysctl.c | 32 ++--- net/sunrpc/sysctl.c | 29 ++-- net/sunrpc/xprtrdma/svc_rdma.c | 7 +- security/apparmor/lsm.c | 2 +- security/min_addr.c | 2 +- security/yama/yama_lsm.c | 2 +- 88 files changed, 458 insertions(+), 653 deletions(-) (limited to 'net') diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index c19aa81ddc8c..7364de008bab 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -203,7 +203,7 @@ static void __init register_insn_emulation(struct insn_emulation_ops *ops) } static int emulation_proc_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 94289d126993..35cb5e66c504 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -341,8 +341,7 @@ static unsigned int find_supported_vector_length(unsigned int vl) #ifdef 
CONFIG_SYSCTL static int sve_proc_do_default_vl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; int vl = sve_default_vl; diff --git a/arch/mips/lasat/sysctl.c b/arch/mips/lasat/sysctl.c index e666fe26c50d..2119541a5b8b 100644 --- a/arch/mips/lasat/sysctl.c +++ b/arch/mips/lasat/sysctl.c @@ -95,16 +95,15 @@ int proc_lasat_ip(struct ctl_table *table, int write, len = 0; p = buffer; while (len < *lenp) { - if (get_user(c, p++)) - return -EFAULT; + c = *p; + p++; if (c == 0 || c == '\n') break; len++; } if (len >= sizeof(ipbuf)-1) len = sizeof(ipbuf) - 1; - if (copy_from_user(ipbuf, buffer, len)) - return -EFAULT; + memcpy(ipbuf, buffer, len); ipbuf[len] = 0; *ppos += *lenp; /* Now see if we can convert it to a valid IP */ @@ -122,11 +121,9 @@ int proc_lasat_ip(struct ctl_table *table, int write, if (len > *lenp) len = *lenp; if (len) - if (copy_to_user(buffer, ipbuf, len)) - return -EFAULT; + memcpy(buffer, ipbuf, len); if (len < *lenp) { - if (put_user('\n', ((char *) buffer) + len)) - return -EFAULT; + *((char *)buffer + len) = '\n'; len++; } *lenp = len; diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c index aa738cad1338..d74a4c7d5df6 100644 --- a/arch/s390/appldata/appldata_base.c +++ b/arch/s390/appldata/appldata_base.c @@ -51,10 +51,9 @@ static struct platform_device *appldata_pdev; */ static const char appldata_proc_name[APPLDATA_PROC_NAME_LENGTH] = "appldata"; static int appldata_timer_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static int appldata_interval_handler(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static struct ctl_table_header *appldata_sysctl_header; static struct ctl_table appldata_table[] = { @@ -217,7 +216,7 @@ static void __appldata_vtimer_setup(int cmd) */ static int appldata_timer_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int timer_active = appldata_timer_active; int rc; @@ -250,7 +249,7 @@ appldata_timer_handler(struct ctl_table *ctl, int write, */ static int appldata_interval_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int interval = appldata_interval; int rc; @@ -280,7 +279,7 @@ appldata_interval_handler(struct ctl_table *ctl, int write, */ static int appldata_generic_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct appldata_ops *ops = NULL, *tmp_ops; struct list_head *lh; diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index 6d321f5f101d..636446003a06 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -867,7 +867,7 @@ static int debug_active = 1; * if debug_active is already off */ static int s390dbf_procactive(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!write || debug_stoppable || !debug_active) return proc_dointvec(table, write, buffer, lenp, ppos); diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 5f70cefc13e4..332b542548cd 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -594,7 
+594,7 @@ static int __init topology_setup(char *str) early_param("topology", topology_setup); static int topology_ctl_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int enabled = topology_is_enabled(); int new_mode; diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index ae989b740376..36bce727897b 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -245,7 +245,7 @@ static int cmm_skip_blanks(char *cp, char **endp) } static int cmm_pages_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { long nr = cmm_get_pages(); struct ctl_table ctl_entry = { @@ -264,7 +264,7 @@ static int cmm_pages_handler(struct ctl_table *ctl, int write, } static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { long nr = cmm_get_timed_pages(); @@ -284,7 +284,7 @@ static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, } static int cmm_timeout_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char buf[64], *p; long nr, seconds; @@ -297,8 +297,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write, if (write) { len = min(*lenp, sizeof(buf)); - if (copy_from_user(buf, buffer, len)) - return -EFAULT; + memcpy(buf, buffer, len); buf[len - 1] = '\0'; cmm_skip_blanks(buf, &p); nr = simple_strtoul(p, &p, 0); @@ -311,8 +310,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write, cmm_timeout_pages, cmm_timeout_seconds); if (len > *lenp) len = *lenp; - if (copy_to_user(buffer, buf, len)) - return -EFAULT; + memcpy(buffer, buf, len); *lenp = len; *ppos += len; } diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 1cb3ca9bba49..1afbdd1dd777 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -39,8 +39,7 @@ static bool __read_mostly sched_itmt_capable; unsigned int __read_mostly sysctl_sched_itmt_enabled; static int sched_itmt_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int old_sysctl; int ret; diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index faca0f346fff..e3bbe108eb54 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -3631,7 +3631,7 @@ static void cdrom_update_settings(void) } static int cdrom_sysctl_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/drivers/char/random.c b/drivers/char/random.c index 0d10e31fd342..1e0db78b83ba 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -2057,7 +2057,7 @@ static char sysctl_bootid[16]; * sysctl system call, as 16 bytes of binary data. 
*/ static int proc_do_uuid(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table fake_table; unsigned char buf[64], tmp_uuid[16], *uuid; diff --git a/drivers/macintosh/mac_hid.c b/drivers/macintosh/mac_hid.c index 7af0c536d568..28b8581b44dd 100644 --- a/drivers/macintosh/mac_hid.c +++ b/drivers/macintosh/mac_hid.c @@ -183,8 +183,7 @@ static void mac_hid_stop_emulation(void) } static int mac_hid_toggle_emumouse(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; int old_val = *valp; diff --git a/drivers/parport/procfs.c b/drivers/parport/procfs.c index 48804049d697..ee7b5daabfd4 100644 --- a/drivers/parport/procfs.c +++ b/drivers/parport/procfs.c @@ -34,7 +34,7 @@ #define PARPORT_MAX_SPINTIME_VALUE 1000 static int do_active_device(struct ctl_table *table, int write, - void __user *result, size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[256]; @@ -65,13 +65,13 @@ static int do_active_device(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } #ifdef CONFIG_PARPORT_1284 static int do_autoprobe(struct ctl_table *table, int write, - void __user *result, size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport_device_info *info = table->extra2; const char *str; @@ -108,13 +108,13 @@ static int do_autoprobe(struct ctl_table *table, int write, *ppos += len; - return copy_to_user (result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } #endif /* IEEE1284.3 support. */ static int do_hardware_base_addr(struct ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[20]; @@ -136,13 +136,12 @@ static int do_hardware_base_addr(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } static int do_hardware_irq(struct ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[20]; @@ -164,13 +163,12 @@ static int do_hardware_irq(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } static int do_hardware_dma(struct ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[20]; @@ -192,13 +190,12 @@ static int do_hardware_dma(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? 
-EFAULT : 0; + memcpy(result, buffer, len); + return 0; } static int do_hardware_modes(struct ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[40]; @@ -231,8 +228,8 @@ static int do_hardware_modes(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } #define PARPORT_PORT_DIR(CHILD) { .procname = NULL, .mode = 0555, .child = CHILD } diff --git a/fs/dcache.c b/fs/dcache.c index b280e07e162b..8dd4d8d7bd0b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -165,7 +165,7 @@ static long get_nr_dentry_negative(void) return sum < 0 ? 0 : sum; } -int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer, +int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); diff --git a/fs/drop_caches.c b/fs/drop_caches.c index dc1a1d5d825b..f00fcc4a4f72 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -47,7 +47,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) } int drop_caches_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int ret; diff --git a/fs/file_table.c b/fs/file_table.c index 30d55c9a1744..3b612535391f 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -80,14 +80,14 @@ EXPORT_SYMBOL_GPL(get_max_files); */ #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) int proc_nr_files(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #else int proc_nr_files(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 59c2494efda3..c1e6cc9091aa 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -51,8 +51,7 @@ static unsigned fscache_op_max_active = 2; static struct ctl_table_header *fscache_sysctl_header; static int fscache_max_active_sysctl(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct workqueue_struct **wqp = table->extra1; unsigned int *datap = table->data; diff --git a/fs/inode.c b/fs/inode.c index 93d9252a00ab..cc6e701b7e5d 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -108,7 +108,7 @@ long get_nr_dirty_inodes(void) */ #ifdef CONFIG_SYSCTL int proc_nr_inodes(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { inodes_stat.nr_inodes = get_nr_inodes(); inodes_stat.nr_unused = get_nr_inodes_unused(); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index b6f5d459b087..df2143e05c57 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -539,13 +539,13 @@ out: return err; } -static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, +static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf, size_t count, loff_t *ppos, int write) { struct inode *inode = file_inode(filp); struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = 
PROC_I(inode)->sysctl_entry; - void *new_buf = NULL; + void *kbuf; ssize_t error; if (IS_ERR(head)) @@ -564,27 +564,38 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, if (!table->proc_handler) goto out; - error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, &count, - ppos, &new_buf); + if (write) { + kbuf = memdup_user_nul(ubuf, count); + if (IS_ERR(kbuf)) { + error = PTR_ERR(kbuf); + goto out; + } + } else { + error = -ENOMEM; + kbuf = kzalloc(count, GFP_KERNEL); + if (!kbuf) + goto out; + } + + error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count, + ppos); if (error) - goto out; + goto out_free_buf; /* careful: calling conventions are nasty here */ - if (new_buf) { - mm_segment_t old_fs; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - error = table->proc_handler(table, write, (void __user *)new_buf, - &count, ppos); - set_fs(old_fs); - kfree(new_buf); - } else { - error = table->proc_handler(table, write, buf, &count, ppos); + error = table->proc_handler(table, write, kbuf, &count, ppos); + if (error) + goto out_free_buf; + + if (!write) { + error = -EFAULT; + if (copy_to_user(ubuf, kbuf, count)) + goto out_free_buf; } - if (!error) - error = count; + error = count; +out_free_buf: + kfree(kbuf); out: sysctl_head_finish(head); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index b6a4f692d345..7b4bac91146b 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2841,7 +2841,7 @@ const struct quotactl_ops dquot_quotactl_sysfile_ops = { EXPORT_SYMBOL(dquot_quotactl_sysfile_ops); static int do_proc_dqstats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int type = (unsigned long *)table->data - dqstats.stat; s64 value = percpu_counter_sum(&dqstats.counter[type]); diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 31b3bdbd2eba..021ef96d0542 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -13,7 +13,7 @@ STATIC int xfs_stats_clear_proc_handler( struct ctl_table *ctl, int write, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos) { @@ -33,7 +33,7 @@ STATIC int xfs_panic_mask_proc_handler( struct ctl_table *ctl, int write, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos) { diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index c11b413d5b1a..0b41fd5fc96b 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -138,8 +138,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - void __user *buf, size_t *pcount, - loff_t *ppos, void **new_buf, + void **buf, size_t *pcount, loff_t *ppos, enum bpf_attach_type type); int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level, @@ -302,12 +301,12 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, }) -#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos, nbuf) \ +#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) \ __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ - buf, count, pos, nbuf, \ + buf, count, pos, \ BPF_CGROUP_SYSCTL); \ __ret; \ }) @@ -429,7 +428,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define 
BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos,nbuf) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; }) #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ optlen, max_optlen, retval) ({ retval; }) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 4b898cdbdf05..a0eabfbeb0e1 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -86,7 +86,7 @@ static inline unsigned long compact_gap(unsigned int order) #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos); + void *buffer, size_t *length, loff_t *ppos); extern int sysctl_extfrag_threshold; extern int sysctl_compact_unevictable_allowed; diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f6f59b4f22a..9b028d260649 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3536,11 +3536,11 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, struct ctl_table; int proc_nr_files(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); int proc_nr_dentry(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); int proc_nr_inodes(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); int __init get_filesystem_list(char *buf); #define __FMODE_EXEC ((__force int) FMODE_EXEC) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index db95244a62d4..ddfc377de0d2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1005,8 +1005,7 @@ extern void disable_trace_on_warning(void); extern int __disable_trace_on_warning; int tracepoint_printk_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); #else /* CONFIG_TRACING */ static inline void disable_trace_on_warning(void) { } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 43a1cef8f0f1..92c21c5ccc58 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -105,14 +105,13 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, void hugepage_put_subpool(struct hugepage_subpool *spool); void reset_vma_resv_huge_pages(struct vm_area_struct *vma); -int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); -int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); -int hugetlb_treat_movable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); - -#ifdef CONFIG_NUMA -int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -#endif +int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); +int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int hugetlb_treat_movable_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); long follow_hugetlb_page(struct 
mm_struct *, struct vm_area_struct *, diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 04bdaf01112c..594265bfd390 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -312,7 +312,7 @@ DEFINE_INSN_CACHE_OPS(optinsn); #ifdef CONFIG_SYSCTL extern int sysctl_kprobes_optimization; extern int proc_kprobes_optimization_handler(struct ctl_table *table, - int write, void __user *buffer, + int write, void *buffer, size_t *length, loff_t *ppos); #endif extern void wait_for_kprobe_optimizer(void); diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h index 9022f0c2e2e4..abe3d95f795b 100644 --- a/include/linux/latencytop.h +++ b/include/linux/latencytop.h @@ -38,8 +38,8 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter) void clear_tsk_latency_tracing(struct task_struct *p); -extern int sysctl_latencytop(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); +int sysctl_latencytop(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); #else diff --git a/include/linux/mm.h b/include/linux/mm.h index 9c4e7e76dedd..a7b1ef8ed970 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -201,10 +201,10 @@ extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern unsigned long sysctl_overcommit_kbytes; -extern int overcommit_ratio_handler(struct ctl_table *, int, void __user *, - size_t *, loff_t *); -extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *, - size_t *, loff_t *); +int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) @@ -2957,8 +2957,8 @@ extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm); #ifdef CONFIG_SYSCTL extern int sysctl_drop_caches; -int drop_caches_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); #endif void drop_slab(void); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b2af594ef0f7..93cf20f41e26 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -910,22 +910,21 @@ static inline int is_highmem(struct zone *zone) /* These two functions are used to setup the per zone pages min values */ struct ctl_table; -int min_free_kbytes_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *, + size_t *, loff_t *); extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES]; -int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *, + size_t *, loff_t *); int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); + void *, size_t *, loff_t *); int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); + void *, size_t *, loff_t *); int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); - -extern int 
numa_zonelist_order_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); + void *, size_t *, loff_t *); +int numa_zonelist_order_handler(struct ctl_table *, int, + void *, size_t *, loff_t *); extern int percpu_pagelist_fraction; extern char numa_zonelist_order[]; #define NUMA_ZONELIST_ORDER_LEN 16 diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 9003e29cde46..750c7f395ca9 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -202,16 +202,11 @@ static inline void watchdog_update_hrtimer_threshold(u64 period) { } #endif struct ctl_table; -extern int proc_watchdog(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -extern int proc_nmi_watchdog(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -extern int proc_soft_watchdog(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -extern int proc_watchdog_thresh(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -extern int proc_watchdog_cpumask(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +int proc_watchdog(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_nmi_watchdog(struct ctl_table *, int , void *, size_t *, loff_t *); +int proc_soft_watchdog(struct ctl_table *, int , void *, size_t *, loff_t *); +int proc_watchdog_thresh(struct ctl_table *, int , void *, size_t *, loff_t *); +int proc_watchdog_cpumask(struct ctl_table *, int, void *, size_t *, loff_t *); #ifdef CONFIG_HAVE_ACPI_APEI_NMI #include diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 9c3e7619c929..347ea379622a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1280,15 +1280,12 @@ extern int sysctl_perf_cpu_time_max_percent; extern void perf_sample_event_took(u64 sample_len_ns); -extern int perf_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - +int perf_proc_update_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); int perf_event_max_stack_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); /* Access to perf_event_open(2) syscall. 
*/ #define PERF_SECURITY_OPEN 0 diff --git a/include/linux/printk.h b/include/linux/printk.h index e061635e0409..fcde0772ec98 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -189,7 +189,7 @@ extern int printk_delay_msec; extern int dmesg_restrict; extern int -devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void __user *buf, +devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void *buf, size_t *lenp, loff_t *ppos); extern void wake_up_klogd(void); diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index d4f6215ee03f..7b4d3a49b6c5 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -12,9 +12,8 @@ extern unsigned int sysctl_hung_task_panic; extern unsigned long sysctl_hung_task_timeout_secs; extern unsigned long sysctl_hung_task_check_interval_secs; extern int sysctl_hung_task_warnings; -extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos); +int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); #else /* Avoid need for ifdefs elsewhere in the code */ enum { sysctl_hung_task_timeout_secs = 0 }; @@ -43,8 +42,7 @@ extern __read_mostly unsigned int sysctl_sched_migration_cost; extern __read_mostly unsigned int sysctl_sched_nr_migrate; int sched_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, - loff_t *ppos); + void *buffer, size_t *length, loff_t *ppos); #endif /* @@ -72,33 +70,21 @@ extern unsigned int sysctl_sched_autogroup_enabled; extern int sysctl_sched_rr_timeslice; extern int sched_rr_timeslice; -extern int sched_rr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - -extern int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - -#ifdef CONFIG_UCLAMP_TASK -extern int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -#endif - -extern int sysctl_numa_balancing(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - -extern int sysctl_schedstats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); +int sched_rr_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +int sched_rt_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) extern unsigned int sysctl_sched_energy_aware; -extern int sched_energy_aware_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); +int sched_energy_aware_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); #endif #endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/include/linux/security.h b/include/linux/security.h index a8d9310472df..6aa229b252ce 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -211,7 +211,7 @@ struct request_sock; #ifdef CONFIG_MMU extern int 
mmap_min_addr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); #endif /* security_inode_init_security callback function to write xattrs */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 36143ca40b56..f2401e45a3c2 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -44,35 +44,26 @@ struct ctl_dir; extern const int sysctl_vals[]; -typedef int proc_handler (struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); - -extern int proc_dostring(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_dointvec(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_douintvec(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_dointvec_minmax(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int proc_dointvec_jiffies(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_dointvec_ms_jiffies(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_doulongvec_minmax(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int, - void __user *, size_t *, loff_t *); -extern int proc_do_large_bitmap(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_do_static_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); +typedef int proc_handler(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos); + +int proc_dostring(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dointvec(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_douintvec(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dointvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_douintvec_minmax(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +int proc_dointvec_jiffies(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dointvec_userhz_jiffies(struct ctl_table *, int, void *, size_t *, + loff_t *); +int proc_dointvec_ms_jiffies(struct ctl_table *, int, void *, size_t *, + loff_t *); +int proc_doulongvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int, void *, + size_t *, loff_t *); +int proc_do_large_bitmap(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_do_static_key(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); /* * Register a set of sysctl names by calling register_sysctl_table @@ -246,7 +237,7 @@ static inline void setup_sysctl_set(struct ctl_table_set *p, #endif /* CONFIG_SYSCTL */ -int sysctl_max_threads(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); +int sysctl_max_threads(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); #endif /* _LINUX_SYSCTL_H */ diff --git a/include/linux/timer.h b/include/linux/timer.h index 0dc19a8c39c9..07910ae5ddd9 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -201,8 +201,7 @@ struct ctl_table; extern 
unsigned int sysctl_timer_migration; int timer_migration_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); #endif unsigned long __round_jiffies(unsigned long j, int cpu); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 292485f3d24d..cb507151710f 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -16,8 +16,8 @@ extern int sysctl_stat_interval; #define DISABLE_NUMA_STAT 0 extern int sysctl_vm_numa_stat; DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key); -extern int sysctl_vm_numa_stat_handler(struct ctl_table *table, - int write, void __user *buffer, size_t *length, loff_t *ppos); +int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos); #endif struct reclaim_stat { @@ -274,8 +274,8 @@ void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); struct ctl_table; -int vmstat_refresh(struct ctl_table *, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); +int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp, + loff_t *ppos); void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a19d845dd7eb..f8a7e1a850fb 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -362,24 +362,18 @@ extern int vm_highmem_is_dirtyable; extern int block_dump; extern int laptop_mode; -extern int dirty_background_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int dirty_background_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int dirty_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int dirty_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); +int dirty_background_ratio_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int dirty_background_bytes_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int dirty_ratio_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int dirty_bytes_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); int dirtytime_interval_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); - -struct ctl_table; -int dirty_writeback_centisecs_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); + void *buffer, size_t *lenp, loff_t *ppos); +int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index affd66537e87..d1b8644bfb88 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -24,7 +24,7 @@ static void *get_ipc(struct ctl_table *table) #ifdef CONFIG_PROC_SYSCTL static int proc_ipc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; @@ -35,7 +35,7 @@ static int proc_ipc_dointvec(struct ctl_table *table, int write, } static int 
proc_ipc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; @@ -46,7 +46,7 @@ static int proc_ipc_dointvec_minmax(struct ctl_table *table, int write, } static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ipc_namespace *ns = current->nsproxy->ipc_ns; int err = proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -59,7 +59,7 @@ static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write, } static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; memcpy(&ipc_table, table, sizeof(ipc_table)); @@ -70,7 +70,7 @@ static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, } static int proc_ipc_auto_msgmni(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; int dummy = 0; diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c index 7c00f28923a8..72a92a08c848 100644 --- a/ipc/mq_sysctl.c +++ b/ipc/mq_sysctl.c @@ -19,7 +19,7 @@ static void *get_mq(struct ctl_table *table) } static int proc_mq_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table mq_table; memcpy(&mq_table, table, sizeof(mq_table)); @@ -29,7 +29,7 @@ static int proc_mq_dointvec(struct ctl_table *table, int write, } static int proc_mq_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table mq_table; memcpy(&mq_table, table, sizeof(mq_table)); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index cb305e71e7de..977bc69bb1c5 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1137,16 +1137,13 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { * @head: sysctl table header * @table: sysctl table * @write: sysctl is being read (= 0) or written (= 1) - * @buf: pointer to buffer passed by user space + * @buf: pointer to buffer (in and out) * @pcount: value-result argument: value is size of buffer pointed to by @buf, * result is size of @new_buf if program set new value, initial value * otherwise * @ppos: value-result argument: value is position at which read from or write * to sysctl is happening, result is new position if program overrode it, * initial value otherwise - * @new_buf: pointer to pointer to new buffer that will be allocated if program - * overrides new value provided by user space on sysctl write - * NOTE: it's caller responsibility to free *new_buf if it was set * @type: type of program to be executed * * Program is run when sysctl is being accessed, either read or written, and @@ -1157,8 +1154,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { */ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - void __user *buf, size_t *pcount, - loff_t *ppos, void **new_buf, + void **buf, size_t *pcount, loff_t *ppos, enum bpf_attach_type type) { struct bpf_sysctl_kern ctx = { @@ -1173,36 +1169,28 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, .new_updated = 0, }; struct cgroup *cgrp; + 
loff_t pos = 0;
 	int ret;
 
 	ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
-	if (ctx.cur_val) {
-		mm_segment_t old_fs;
-		loff_t pos = 0;
-
-		old_fs = get_fs();
-		set_fs(KERNEL_DS);
-		if (table->proc_handler(table, 0, (void __user *)ctx.cur_val,
-					&ctx.cur_len, &pos)) {
-			/* Let BPF program decide how to proceed. */
-			ctx.cur_len = 0;
-		}
-		set_fs(old_fs);
-	} else {
+	if (!ctx.cur_val ||
+	    table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) {
 		/* Let BPF program decide how to proceed. */
 		ctx.cur_len = 0;
 	}
 
-	if (write && buf && *pcount) {
+	if (write && *buf && *pcount) {
 		/* BPF program should be able to override new value with a
 		 * buffer bigger than provided by user.
 		 */
 		ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
 		ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
-		if (!ctx.new_val ||
-		    copy_from_user(ctx.new_val, buf, ctx.new_len))
+		if (ctx.new_val) {
+			memcpy(ctx.new_val, *buf, ctx.new_len);
+		} else {
 			/* Let BPF program decide how to proceed. */
 			ctx.new_len = 0;
+		}
 	}
 
 	rcu_read_lock();
@@ -1213,7 +1201,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 	kfree(ctx.cur_val);
 
 	if (ret == 1 && ctx.new_updated) {
-		*new_buf = ctx.new_val;
+		kfree(*buf);
+		*buf = ctx.new_val;
 		*pcount = ctx.new_len;
 	} else {
 		kfree(ctx.new_val);
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index c2b41a263166..bdb1533ada81 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -236,7 +236,7 @@ exit_put:
 * sysctl_perf_event_max_contexts_per_stack.
 */
 int perf_event_max_stack_handler(struct ctl_table *table, int write,
-				 void __user *buffer, size_t *lenp, loff_t *ppos)
+				 void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int *value = table->data;
 	int new_value = *value, ret;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index bc9b98a9af9a..f86d46f2c4d9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -437,8 +437,7 @@ static void update_perf_cpu_limits(void)
 static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
 
 int perf_proc_update_handler(struct ctl_table *table, int write,
-		void __user *buffer, size_t *lenp,
-		loff_t *ppos)
+		void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int ret;
 	int perf_cpu = sysctl_perf_cpu_time_max_percent;
@@ -462,8 +461,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
 
 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
-				void __user *buffer, size_t *lenp,
-				loff_t *ppos)
+				void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 2625c241ac00..ffbe03a45c16 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -892,7 +892,7 @@ static void unoptimize_all_kprobes(void)
 static DEFINE_MUTEX(kprobe_sysctl_mutex);
 int sysctl_kprobes_optimization;
 int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
-				      void __user *buffer, size_t *length,
+				      void *buffer, size_t *length,
 				      loff_t *ppos)
 {
 	int ret;
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 8d1c15832e55..166d7bf49666 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -269,8 +269,8 @@ static int __init init_lstats_procfs(void)
 	return 0;
 }
 
-int sysctl_latencytop(struct ctl_table *table, int write,
-		      void __user *buffer, size_t *lenp, loff_t *ppos)
+int sysctl_latencytop(struct ctl_table *table, int write, void *buffer,
+		      size_t *lenp, loff_t *ppos)
 {
int err; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 01f8ba32cc0c..3ccaba5f15c0 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -263,7 +263,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) #ifdef CONFIG_CHECKPOINT_RESTORE static int pid_ns_ctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9a9b6156270b..471f649b5868 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -173,7 +173,7 @@ __setup("printk.devkmsg=", control_devkmsg); char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char old_str[DEVKMSG_STR_MAX_SIZE]; unsigned int old; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3a61a3b8eaa9..5c589a2e4d19 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1110,8 +1110,7 @@ static void uclamp_update_root_tg(void) { } #endif int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { bool update_root_tg = false; int old_min, old_max; @@ -2723,7 +2722,7 @@ void set_numabalancing_state(bool enabled) #ifdef CONFIG_PROC_SYSCTL int sysctl_numa_balancing(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; @@ -2797,8 +2796,8 @@ static void __init init_schedstats(void) } #ifdef CONFIG_PROC_SYSCTL -int sysctl_schedstats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 02f323b85b6d..b6077fd5b32f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -645,8 +645,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) */ int sched_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); unsigned int factor = get_update_sysctl_factor(); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index df11d88c9895..45da29de3ecc 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2714,9 +2714,8 @@ static void sched_rt_do_global(void) def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); } -int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int sched_rt_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int old_period, old_runtime; static DEFINE_MUTEX(mutex); @@ -2754,9 +2753,8 @@ undo: return ret; } -int sched_rr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int sched_rr_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; static DEFINE_MUTEX(mutex); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8344757bba6e..fa64b2ee9fe6 100644 --- 
a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -209,7 +209,7 @@ bool sched_energy_update; #ifdef CONFIG_PROC_SYSCTL int sched_energy_aware_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret, state; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 55a6184f5990..d653d8426de9 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1776,7 +1776,7 @@ static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged, } static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3fafca3ced98..e961286d0e14 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -208,12 +208,10 @@ static int max_extfrag_threshold = 1000; #ifdef CONFIG_PROC_SYSCTL static int _proc_do_string(char *data, int maxlen, int write, - char __user *buffer, - size_t *lenp, loff_t *ppos) + char *buffer, size_t *lenp, loff_t *ppos) { size_t len; - char __user *p; - char c; + char c, *p; if (!data || !maxlen || !*lenp) { *lenp = 0; @@ -238,8 +236,7 @@ static int _proc_do_string(char *data, int maxlen, int write, *ppos += *lenp; p = buffer; while ((p - buffer) < *lenp && len < maxlen - 1) { - if (get_user(c, p++)) - return -EFAULT; + c = *(p++); if (c == 0 || c == '\n') break; data[len++] = c; @@ -261,11 +258,9 @@ static int _proc_do_string(char *data, int maxlen, int write, if (len > *lenp) len = *lenp; if (len) - if (copy_to_user(buffer, data, len)) - return -EFAULT; + memcpy(buffer, data, len); if (len < *lenp) { - if (put_user('\n', buffer + len)) - return -EFAULT; + buffer[len] = '\n'; len++; } *lenp = len; @@ -326,13 +321,13 @@ static bool proc_first_pos_non_zero_ignore(loff_t *ppos, * Returns 0 on success. */ int proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (write) proc_first_pos_non_zero_ignore(ppos, table); - return _proc_do_string((char *)(table->data), table->maxlen, write, - (char __user *)buffer, lenp, ppos); + return _proc_do_string(table->data, table->maxlen, write, buffer, lenp, + ppos); } static size_t proc_skip_spaces(char **buf) @@ -463,11 +458,10 @@ static int proc_get_long(char **buf, size_t *size, * @val: the integer to be converted * @neg: sign of the number, %TRUE for negative * - * In case of success %0 is returned and @buf and @size are updated with - * the amount of bytes written. + * In case of success @buf and @size are updated with the amount of bytes + * written. 
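 *
 * A minimal usage sketch under the new kernel-buffer convention (the
 * caller and buffer size here are illustrative, not from this patch):
 *
 *	char tmp[32];
 *	void *p = tmp;
 *	size_t left = sizeof(tmp);
 *
 *	proc_put_long(&p, &left, 42UL, false);
 *
 * On return tmp begins with "42", @buf has advanced past the digits and
 * @size has shrunk by the same amount; there is no error to check.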
*/
-static int proc_put_long(void __user **buf, size_t *size, unsigned long val,
-			 bool neg)
+static void proc_put_long(void **buf, size_t *size, unsigned long val, bool neg)
 {
 	int len;
 	char tmp[TMPBUFLEN], *p = tmp;
@@ -476,24 +470,22 @@ static int proc_put_long(void __user **buf, size_t *size, unsigned long val,
 	len = strlen(tmp);
 	if (len > *size)
 		len = *size;
-	if (copy_to_user(*buf, tmp, len))
-		return -EFAULT;
+	memcpy(*buf, tmp, len);
 	*size -= len;
 	*buf += len;
-	return 0;
 }
 #undef TMPBUFLEN
 
-static int proc_put_char(void __user **buf, size_t *size, char c)
+static void proc_put_char(void **buf, size_t *size, char c)
 {
 	if (*size) {
-		char __user **buffer = (char __user **)buf;
-		if (put_user(c, *buffer))
-			return -EFAULT;
-		(*size)--, (*buffer)++;
+		char **buffer = (char **)buf;
+		**buffer = c;
+
+		(*size)--;
+		(*buffer)++;
 		*buf = *buffer;
 	}
-	return 0;
 }
 
 static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
@@ -541,7 +533,7 @@ static int do_proc_douintvec_conv(unsigned long *lvalp,
 static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
 
 static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
-		  int write, void __user *buffer,
+		  int write, void *buffer,
 		  size_t *lenp, loff_t *ppos,
 		  int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
 			      int write, void *data),
@@ -549,7 +541,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
 {
 	int *i, vleft, first = 1, err = 0;
 	size_t left;
-	char *kbuf = NULL, *p;
+	char *p;
 
 	if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
 		*lenp = 0;
@@ -569,9 +561,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
 		if (left > PAGE_SIZE - 1)
 			left = PAGE_SIZE - 1;
-		p = kbuf = memdup_user_nul(buffer, left);
-		if (IS_ERR(kbuf))
-			return PTR_ERR(kbuf);
+		p = buffer;
 	}
 
 	for (; left && vleft--; i++, first=0) {
@@ -598,24 +588,17 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
 				break;
 			}
 			if (!first)
-				err = proc_put_char(&buffer, &left, '\t');
-			if (err)
-				break;
-			err = proc_put_long(&buffer, &left, lval, neg);
-			if (err)
-				break;
+				proc_put_char(&buffer, &left, '\t');
+			proc_put_long(&buffer, &left, lval, neg);
 		}
 	}
 
 	if (!write && !first && left && !err)
-		err = proc_put_char(&buffer, &left, '\n');
+		proc_put_char(&buffer, &left, '\n');
 	if (write && !err && left)
 		left -= proc_skip_spaces(&p);
-	if (write) {
-		kfree(kbuf);
-		if (first)
-			return err ? : -EINVAL;
-	}
+	if (write && first)
+		return err ?
: -EINVAL; *lenp -= left; out: *ppos += *lenp; @@ -623,7 +606,7 @@ out: } static int do_proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos, + void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(bool *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) @@ -634,7 +617,7 @@ static int do_proc_dointvec(struct ctl_table *table, int write, static int do_proc_douintvec_w(unsigned int *tbl_data, struct ctl_table *table, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, @@ -645,7 +628,7 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, int err = 0; size_t left; bool neg; - char *kbuf = NULL, *p; + char *p = buffer; left = *lenp; @@ -655,10 +638,6 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return -EINVAL; - left -= proc_skip_spaces(&p); if (!left) { err = -EINVAL; @@ -682,7 +661,6 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, left -= proc_skip_spaces(&p); out_free: - kfree(kbuf); if (err) return -EINVAL; @@ -694,7 +672,7 @@ bail_early: return err; } -static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer, +static int do_proc_douintvec_r(unsigned int *tbl_data, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, @@ -712,11 +690,11 @@ static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer, goto out; } - err = proc_put_long(&buffer, &left, lval, false); - if (err || !left) + proc_put_long(&buffer, &left, lval, false); + if (!left) goto out; - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); out: *lenp -= left; @@ -726,7 +704,7 @@ out: } static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, - int write, void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, @@ -762,7 +740,7 @@ static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, } static int do_proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos, + void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, int write, void *data), @@ -785,16 +763,15 @@ static int do_proc_douintvec(struct ctl_table *table, int write, * * Returns 0 on success. */ -int proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_dointvec(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); } #ifdef CONFIG_COMPACTION static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, - int write, void __user *buffer, - size_t *lenp, loff_t *ppos) + int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret, old; @@ -826,8 +803,8 @@ static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, * * Returns 0 on success. 
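 *
 * A minimal sketch of calling it with a kernel buffer, as all callers
 * do after this patch (the buffer size and the pr_debug are illustrative,
 * not from this patch):
 *
 *	char buf[32];
 *	size_t len = sizeof(buf);
 *	loff_t pos = 0;
 *
 *	if (!proc_douintvec(table, 0, buf, &len, &pos))
 *		pr_debug("value is %.*s", (int)len, buf);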
*/ -int proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_douintvec(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { return do_proc_douintvec(table, write, buffer, lenp, ppos, do_proc_douintvec_conv, NULL); @@ -838,7 +815,7 @@ int proc_douintvec(struct ctl_table *table, int write, * This means we can safely use a temporary. */ static int proc_taint(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; unsigned long tmptaint = get_taint(); @@ -870,7 +847,7 @@ static int proc_taint(struct ctl_table *table, int write, #ifdef CONFIG_PRINTK static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; @@ -936,7 +913,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, * Returns 0 on success or -EINVAL on write when the range check fails. */ int proc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_dointvec_minmax_conv_param param = { .min = (int *) table->extra1, @@ -1005,7 +982,7 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, * Returns 0 on success or -ERANGE on write when the range check fails. */ int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_douintvec_minmax_conv_param param = { .min = (unsigned int *) table->extra1, @@ -1036,7 +1013,7 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, } static int proc_dopipe_max_size(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_douintvec(table, write, buffer, lenp, ppos, do_proc_dopipe_max_size_conv, NULL); @@ -1057,7 +1034,7 @@ static void validate_coredump_safety(void) } static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!error) @@ -1067,7 +1044,7 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, #ifdef CONFIG_COREDUMP static int proc_dostring_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int error = proc_dostring(table, write, buffer, lenp, ppos); if (!error) @@ -1078,7 +1055,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, #ifdef CONFIG_MAGIC_SYSRQ static int sysrq_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int tmp, ret; @@ -1096,16 +1073,14 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, } #endif -static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos, - unsigned long convmul, - unsigned long convdiv) +static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, + int write, void *buffer, size_t *lenp, loff_t *ppos, + unsigned long 
convmul, unsigned long convdiv) { unsigned long *i, *min, *max; int vleft, first = 1, err = 0; size_t left; - char *kbuf = NULL, *p; + char *p; if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; @@ -1124,9 +1099,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); + p = buffer; } for (; left && vleft--; i++, first = 0) { @@ -1154,26 +1127,18 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int *i = val; } else { val = convdiv * (*i) / convmul; - if (!first) { - err = proc_put_char(&buffer, &left, '\t'); - if (err) - break; - } - err = proc_put_long(&buffer, &left, val, false); - if (err) - break; + if (!first) + proc_put_char(&buffer, &left, '\t'); + proc_put_long(&buffer, &left, val, false); } } if (!write && !first && left && !err) - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); if (write && !err) left -= proc_skip_spaces(&p); - if (write) { - kfree(kbuf); - if (first) - return err ? : -EINVAL; - } + if (write && first) + return err ? : -EINVAL; *lenp -= left; out: *ppos += *lenp; @@ -1181,10 +1146,8 @@ out: } static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos, - unsigned long convmul, - unsigned long convdiv) + void *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, + unsigned long convdiv) { return __do_proc_doulongvec_minmax(table->data, table, write, buffer, lenp, ppos, convmul, convdiv); @@ -1207,7 +1170,7 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); } @@ -1230,8 +1193,7 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, HZ, 1000l); @@ -1325,7 +1287,7 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, * Returns 0 on success. */ int proc_dointvec_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_jiffies_conv,NULL); @@ -1347,7 +1309,7 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_userhz_jiffies_conv,NULL); @@ -1369,15 +1331,15 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, * * Returns 0 on success. 
*/ -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_ms_jiffies_conv, NULL); } -static int proc_do_cad_pid(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int proc_do_cad_pid(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct pid *new_pid; pid_t tmp; @@ -1416,7 +1378,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_do_large_bitmap(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err = 0; bool first = 1; @@ -1432,7 +1394,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, } if (write) { - char *kbuf, *p; + char *p = buffer; size_t skipped = 0; if (left > PAGE_SIZE - 1) { @@ -1441,15 +1403,9 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, skipped = *lenp - left; } - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL); - if (!tmp_bitmap) { - kfree(kbuf); + if (!tmp_bitmap) return -ENOMEM; - } proc_skip_char(&p, &left, '\n'); while (!err && left) { unsigned long val_a, val_b; @@ -1513,7 +1469,6 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, first = 0; proc_skip_char(&p, &left, '\n'); } - kfree(kbuf); left += skipped; } else { unsigned long bit_a, bit_b = 0; @@ -1525,27 +1480,17 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, bit_b = find_next_zero_bit(bitmap, bitmap_len, bit_a + 1) - 1; - if (!first) { - err = proc_put_char(&buffer, &left, ','); - if (err) - break; - } - err = proc_put_long(&buffer, &left, bit_a, false); - if (err) - break; + if (!first) + proc_put_char(&buffer, &left, ','); + proc_put_long(&buffer, &left, bit_a, false); if (bit_a != bit_b) { - err = proc_put_char(&buffer, &left, '-'); - if (err) - break; - err = proc_put_long(&buffer, &left, bit_b, false); - if (err) - break; + proc_put_char(&buffer, &left, '-'); + proc_put_long(&buffer, &left, bit_b, false); } first = 0; bit_b++; } - if (!err) - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); } if (!err) { @@ -1566,68 +1511,67 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, #else /* CONFIG_PROC_SYSCTL */ int proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t 
*ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { - return -ENOSYS; + return -ENOSYS; } int proc_do_large_bitmap(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } @@ -1636,8 +1580,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, #if defined(CONFIG_SYSCTL) int proc_do_static_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct static_key *key = (struct static_key *)table->data; static DEFINE_MUTEX(static_key_mutex); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a5221abb4594..398e6eadb861 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -249,8 +249,7 @@ void timers_update_nohz(void) } int timer_migration_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8d2b98812625..167a74a15b1a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2661,7 +2661,7 @@ static void output_printk(struct trace_event_buffer *fbuffer) } int tracepoint_printk_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int save_tracepoint_printk; diff --git a/kernel/umh.c b/kernel/umh.c index 7f255b5a8845..9788ed481a6a 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -630,7 +630,7 @@ int call_usermodehelper(const char *path, char **argv, char **envp, int wait) EXPORT_SYMBOL(call_usermodehelper); static int proc_cap_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 3732c888a949..4ca61d49885b 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -30,7 +30,7 @@ static void *get_uts(struct ctl_table *table) * to observe. Should this be in kernel/sys.c ???? 
*/ static int proc_do_uts_string(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table uts_table; int r; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b6b1f54a7837..53ff2c81b084 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -661,7 +661,7 @@ static void proc_watchdog_update(void) * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED */ static int proc_watchdog_common(int which, struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err, old, *param = table->data; @@ -688,7 +688,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, * /proc/sys/kernel/watchdog */ int proc_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED, table, write, buffer, lenp, ppos); @@ -698,7 +698,7 @@ int proc_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/nmi_watchdog */ int proc_nmi_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!nmi_watchdog_available && write) return -ENOTSUPP; @@ -710,7 +710,7 @@ int proc_nmi_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/soft_watchdog */ int proc_soft_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(SOFT_WATCHDOG_ENABLED, table, write, buffer, lenp, ppos); @@ -720,7 +720,7 @@ int proc_soft_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/watchdog_thresh */ int proc_watchdog_thresh(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err, old; @@ -743,7 +743,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, * been brought online, if desired. 
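 *
 * (Sketch, not part of the patch: a write such as
 *
 *	echo 0-3 > /proc/sys/kernel/watchdog_cpumask
 *
 * now arrives here as kernel memory, so the bitmap parser this handler
 * relies on can walk the "0-3" string directly instead of taking its own
 * memdup_user_nul() copy.)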
*/ int proc_watchdog_cpumask(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err; diff --git a/mm/compaction.c b/mm/compaction.c index 46f0fcc93081..d8cfb7b99a83 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2463,7 +2463,7 @@ int sysctl_compact_memory; * /proc/sys/vm/compact_memory */ int sysctl_compaction_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { if (write) compact_nodes(); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cd459155d28a..2277c5728b1f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3352,7 +3352,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array) #ifdef CONFIG_SYSCTL static int hugetlb_sysctl_handler_common(bool obey_mempolicy, struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; unsigned long tmp = h->max_huge_pages; @@ -3375,7 +3375,7 @@ out: } int hugetlb_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { return hugetlb_sysctl_handler_common(false, table, write, @@ -3384,7 +3384,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, #ifdef CONFIG_NUMA int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { return hugetlb_sysctl_handler_common(true, table, write, buffer, length, ppos); @@ -3392,8 +3392,7 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, #endif /* CONFIG_NUMA */ int hugetlb_overcommit_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; unsigned long tmp; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7326b54ab728..d3ee4c4dafac 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -512,8 +512,7 @@ bool node_dirty_ok(struct pglist_data *pgdat) } int dirty_background_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -524,8 +523,7 @@ int dirty_background_ratio_handler(struct ctl_table *table, int write, } int dirty_background_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -535,9 +533,8 @@ int dirty_background_bytes_handler(struct ctl_table *table, int write, return ret; } -int dirty_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; int ret; @@ -551,8 +548,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write, } int dirty_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; int ret; @@ -1972,7 +1968,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ int dirty_writeback_centisecs_handler(struct ctl_table *table, int 
write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { unsigned int old_interval = dirty_writeback_interval; int ret; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 62c1550cd43e..0c43e9ae5004 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5546,21 +5546,11 @@ char numa_zonelist_order[] = "Node"; * sysctl handler for numa_zonelist_order */ int numa_zonelist_order_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, - loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { - char *str; - int ret; - - if (!write) - return proc_dostring(table, write, buffer, length, ppos); - str = memdup_user_nul(buffer, 16); - if (IS_ERR(str)) - return PTR_ERR(str); - - ret = __parse_numa_zonelist_order(str); - kfree(str); - return ret; + if (write) + return __parse_numa_zonelist_order(buffer); + return proc_dostring(table, write, buffer, length, ppos); } @@ -7963,7 +7953,7 @@ core_initcall(init_per_zone_wmark_min) * changes. */ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -7979,7 +7969,7 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, } int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -8009,7 +7999,7 @@ static void setup_min_unmapped_ratio(void) int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -8036,7 +8026,7 @@ static void setup_min_slab_ratio(void) } int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -8060,7 +8050,7 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, * if in function of the boot time zone sizes. */ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { proc_dointvec_minmax(table, write, buffer, length, ppos); setup_per_zone_lowmem_reserve(); @@ -8082,7 +8072,7 @@ static void __zone_pcp_update(struct zone *zone) * pagelist can have before it gets flushed back to buddy allocator. 
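 *
 * (Sketch, not part of the patch: after this series a write such as
 *
 *	echo 8 > /proc/sys/vm/percpu_pagelist_fraction
 *
 * reaches this handler as a NUL-terminated kernel copy of "8\n" made by
 * memdup_user_nul() in proc_sys_call_handler(), so a helper such as
 * proc_dointvec_minmax() can parse it in place without copy_from_user().)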
*/ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { struct zone *zone; int old_percpu_pagelist_fraction; diff --git a/mm/util.c b/mm/util.c index 988d11e6c17c..8defc8ec141f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -717,9 +717,8 @@ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ -int overcommit_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; @@ -729,9 +728,8 @@ int overcommit_ratio_handler(struct ctl_table *table, int write, return ret; } -int overcommit_kbytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; diff --git a/mm/vmstat.c b/mm/vmstat.c index 96d21a792b57..c03a8c914922 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -76,7 +76,7 @@ static void invalid_numa_statistics(void) static DEFINE_MUTEX(vm_numa_stat_lock); int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int ret, oldval; @@ -1751,7 +1751,7 @@ static void refresh_vm_stats(struct work_struct *work) } int vmstat_refresh(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { long val; int err; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 59980ecfc962..04c3f9a82650 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -1027,7 +1027,7 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net, #ifdef CONFIG_SYSCTL static int brnf_sysctl_call_tables(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 39d37d0ef575..3f2263e79e4b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3379,7 +3379,7 @@ EXPORT_SYMBOL(neigh_app_ns); static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN); static int proc_unres_qlen(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int size, ret; struct ctl_table tmp = *ctl; @@ -3443,8 +3443,8 @@ static void neigh_proc_update(struct ctl_table *ctl, int write) } static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { struct ctl_table tmp = *ctl; int ret; @@ -3457,8 +3457,8 @@ static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, return ret; } -int neigh_proc_dointvec(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); @@ -3467,8 +3467,7 @@ int neigh_proc_dointvec(struct ctl_table *ctl, int write, } 
EXPORT_SYMBOL(neigh_proc_dointvec); -int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, - void __user *buffer, +int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); @@ -3479,8 +3478,8 @@ int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, EXPORT_SYMBOL(neigh_proc_dointvec_jiffies); static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos); @@ -3489,8 +3488,7 @@ static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write, } int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); @@ -3500,8 +3498,8 @@ int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies); static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos); @@ -3510,8 +3508,8 @@ static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write, } static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { struct neigh_parms *p = ctl->extra2; int ret; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 9f9e00ba3ad7..0ddb13a6282b 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -45,7 +45,7 @@ EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); #ifdef CONFIG_RPS static int rps_sock_flow_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int orig_size, size; int ret, i; @@ -115,8 +115,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, static DEFINE_MUTEX(flow_limit_update_mutex); static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct sd_flow_limit *cur; struct softnet_data *sd; @@ -180,10 +179,7 @@ write_unlock: } if (len < *lenp) kbuf[len++] = '\n'; - if (copy_to_user(buffer, kbuf, len)) { - ret = -EFAULT; - goto done; - } + memcpy(buffer, kbuf, len); *lenp = len; *ppos += len; } @@ -194,8 +190,7 @@ done: } static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int old, *ptr; int ret; @@ -217,7 +212,7 @@ static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_NET_SCHED static int set_default_qdisc(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char id[IFNAMSIZ]; struct ctl_table tbl = { @@ -236,7 +231,7 @@ static int set_default_qdisc(struct ctl_table *table, int write, #endif static int proc_do_dev_weight(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, 
loff_t *ppos) { int ret; @@ -251,7 +246,7 @@ static int proc_do_dev_weight(struct ctl_table *table, int write, } static int proc_do_rss_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table fake_table; char buf[NETDEV_RSS_KEY_LEN * 3]; @@ -264,7 +259,7 @@ static int proc_do_rss_key(struct ctl_table *table, int write, #ifdef CONFIG_BPF_JIT static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret, jit_enable = *(int *)table->data; @@ -291,8 +286,7 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, # ifdef CONFIG_HAVE_EBPF_JIT static int proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -303,8 +297,7 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, static int proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index cca7ae712995..65abcf1b3210 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -160,8 +160,8 @@ static int max_t3[] = { 8191 }; /* Must fit in 16 bits when multiplied by BCT3MU static int min_priority[1]; static int max_priority[] = { 127 }; /* From DECnet spec */ -static int dn_forwarding_proc(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +static int dn_forwarding_proc(struct ctl_table *, int, void *, size_t *, + loff_t *); static struct dn_dev_sysctl_table { struct ctl_table_header *sysctl_header; struct ctl_table dn_dev_vars[5]; @@ -245,8 +245,7 @@ static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms) } static int dn_forwarding_proc(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { #ifdef CONFIG_DECNET_ROUTER struct net_device *dev = table->extra1; diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c index 55bf64a22b59..deae519bdeec 100644 --- a/net/decnet/sysctl_net_decnet.c +++ b/net/decnet/sysctl_net_decnet.c @@ -134,8 +134,7 @@ static int parse_addr(__le16 *addr, char *str) } static int dn_node_address_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char addr[DN_ASCBUF_LEN]; size_t len; @@ -148,10 +147,7 @@ static int dn_node_address_handler(struct ctl_table *table, int write, if (write) { len = (*lenp < DN_ASCBUF_LEN) ? 
*lenp : (DN_ASCBUF_LEN-1); - - if (copy_from_user(addr, buffer, len)) - return -EFAULT; - + memcpy(addr, buffer, len); addr[len] = 0; strip_it(addr); @@ -173,11 +169,9 @@ static int dn_node_address_handler(struct ctl_table *table, int write, len = strlen(addr); addr[len++] = '\n'; - if (len > *lenp) len = *lenp; - - if (copy_to_user(buffer, addr, len)) - return -EFAULT; - + if (len > *lenp) + len = *lenp; + memcpy(buffer, addr, len); *lenp = len; *ppos += len; @@ -185,8 +179,7 @@ static int dn_node_address_handler(struct ctl_table *table, int write, } static int dn_def_dev_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { size_t len; struct net_device *dev; @@ -201,9 +194,7 @@ static int dn_def_dev_handler(struct ctl_table *table, int write, if (*lenp > 16) return -E2BIG; - if (copy_from_user(devname, buffer, *lenp)) - return -EFAULT; - + memcpy(devname, buffer, *lenp); devname[*lenp] = 0; strip_it(devname); @@ -238,9 +229,7 @@ static int dn_def_dev_handler(struct ctl_table *table, int write, if (len > *lenp) len = *lenp; - if (copy_to_user(buffer, devname, len)) - return -EFAULT; - + memcpy(buffer, devname, len); *lenp = len; *ppos += len; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 30fa42f5997d..a118978d222c 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2361,8 +2361,7 @@ static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf) } static int devinet_conf_proc(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int old_value = *(int *)ctl->data; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); @@ -2414,8 +2413,7 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write, } static int devinet_sysctl_forward(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; @@ -2458,8 +2456,7 @@ static int devinet_sysctl_forward(struct ctl_table *ctl, int write, } static int ipv4_doint_and_flush(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 788c69d9bfe0..041f4dcac440 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3336,8 +3336,7 @@ static int ip_rt_gc_elasticity __read_mostly = 8; static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU; static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = (struct net *)__ctl->extra1; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 81b267e990a1..868e317cc324 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -71,8 +71,7 @@ static void set_local_port_range(struct net *net, int range[2]) /* Validate changes from /proc interface. */ static int ipv4_local_port_range(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.ip_local_ports.range); @@ -107,7 +106,7 @@ static int ipv4_local_port_range(struct ctl_table *table, int write, /* Validate changes from /proc interface. 
*/ static int ipv4_privileged_ports(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_ip_prot_sock); @@ -168,8 +167,7 @@ static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t hig /* Validate changes from /proc interface. */ static int ipv4_ping_group_range(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct user_namespace *user_ns = current_user_ns(); int ret; @@ -204,8 +202,7 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write, } static int ipv4_fwd_update_priority(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int ret; @@ -221,7 +218,7 @@ static int ipv4_fwd_update_priority(struct ctl_table *table, int write, } static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(ctl->data, struct net, ipv4.tcp_congestion_control); @@ -241,9 +238,8 @@ static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, } static int proc_tcp_available_congestion_control(struct ctl_table *ctl, - int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, }; int ret; @@ -258,9 +254,8 @@ static int proc_tcp_available_congestion_control(struct ctl_table *ctl, } static int proc_allowed_congestion_control(struct ctl_table *ctl, - int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX }; int ret; @@ -296,8 +291,7 @@ static int sscanf_key(char *buf, __le32 *key) } static int proc_tcp_fastopen_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_tcp_fastopen); @@ -399,7 +393,7 @@ static void proc_configure_early_demux(int enabled, int protocol) } static int proc_tcp_early_demux(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; @@ -415,7 +409,7 @@ static int proc_tcp_early_demux(struct ctl_table *table, int write, } static int proc_udp_early_demux(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; @@ -431,8 +425,7 @@ static int proc_udp_early_demux(struct ctl_table *table, int write, } static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, - int write, - void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, @@ -447,8 +440,7 @@ static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, } static int proc_tcp_available_ulp(struct ctl_table *ctl, - int write, - void __user *buffer, size_t *lenp, + int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_ULP_BUF_MAX, }; @@ -466,7 +458,7 @@ static int proc_tcp_available_ulp(struct ctl_table *ctl, #ifdef CONFIG_IP_ROUTE_MULTIPATH static 
int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 24e319dfb510..9d0e89bccb90 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6108,9 +6108,8 @@ static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) #ifdef CONFIG_SYSCTL -static -int addrconf_sysctl_forward(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_forward(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; @@ -6134,9 +6133,8 @@ int addrconf_sysctl_forward(struct ctl_table *ctl, int write, return ret; } -static -int addrconf_sysctl_mtu(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_mtu(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { struct inet6_dev *idev = ctl->extra1; int min_mtu = IPV6_MIN_MTU; @@ -6206,9 +6204,8 @@ static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf) return 0; } -static -int addrconf_sysctl_disable(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_disable(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; @@ -6232,9 +6229,8 @@ int addrconf_sysctl_disable(struct ctl_table *ctl, int write, return ret; } -static -int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int ret; @@ -6275,7 +6271,7 @@ int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, } static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; @@ -6337,7 +6333,7 @@ out: } static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int err; @@ -6404,8 +6400,7 @@ out: static int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl, - int write, - void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -6505,10 +6500,8 @@ int addrconf_disable_policy(struct ctl_table *ctl, int *valp, int val) return 0; } -static -int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 1ecd4e9b0bdf..58f1255295d3 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1835,7 +1835,8 @@ static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl, } } -int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) +int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct net_device *dev = ctl->extra1; struct inet6_dev *idev; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 310cbddaa533..acdb31e38412 100644 
--- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6088,9 +6088,8 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v) #ifdef CONFIG_SYSCTL -static -int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int delay; diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 63b657aa8d29..fac2135aa47b 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -26,8 +26,7 @@ static int auto_flowlabels_min; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int ret; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 4701edffb1f7..a42e4ed5ab0e 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1362,8 +1362,7 @@ done: (&((struct mpls_dev *)0)->field) static int mpls_conf_proc(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int oval = *(int *)ctl->data; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); @@ -2594,7 +2593,7 @@ nolabels: } static int mpls_platform_labels(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = table->data; int platform_labels = net->mpls.platform_labels; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 8d14a1acbc37..412656c34f20 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1736,7 +1736,7 @@ static int three = 3; static int proc_do_defense_mode(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct netns_ipvs *ipvs = table->extra2; int *valp = table->data; @@ -1763,7 +1763,7 @@ proc_do_defense_mode(struct ctl_table *table, int write, static int proc_do_sync_threshold(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; int val[2]; @@ -1788,7 +1788,7 @@ proc_do_sync_threshold(struct ctl_table *table, int write, static int proc_do_sync_ports(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; int val = *valp; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 9b57330c81f8..31b027b12ff3 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -517,7 +517,7 @@ static unsigned int nf_conntrack_htable_size_user __read_mostly; static int nf_conntrack_hash_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index bb25d4c794c7..6cb9f9474b05 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -414,7 +414,7 @@ static struct ctl_table nf_log_sysctl_ftable[] = { }; static int nf_log_proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t 
*ppos) { const struct nf_logger *logger; char buf[NFLOGGER_NAME_LEN]; diff --git a/net/phonet/sysctl.c b/net/phonet/sysctl.c index 251e750fd9aa..0d0bf41381c2 100644 --- a/net/phonet/sysctl.c +++ b/net/phonet/sysctl.c @@ -49,8 +49,7 @@ void phonet_get_local_port_range(int *min, int *max) } static int proc_local_port_range(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; int range[2] = {local_port_range[0], local_port_range[1]}; diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 66121bc6f34e..46782fac4c16 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -62,8 +62,7 @@ static atomic_t rds_tcp_unloading = ATOMIC_INIT(0); static struct kmem_cache *rds_tcp_conn_slab; static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *fpos); + void *buffer, size_t *lenp, loff_t *fpos); static int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF; static int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF; @@ -676,8 +675,7 @@ static void rds_tcp_sysctl_reset(struct net *net) } static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *fpos) + void *buffer, size_t *lenp, loff_t *fpos) { struct net *net = current->nsproxy->net_ns; int err; diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 4740aa70e652..c16c80963e55 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -43,20 +43,15 @@ static unsigned long max_autoclose_max = ? UINT_MAX : MAX_SCHEDULE_TIMEOUT / HZ; static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); +static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos); static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static int proc_sctp_do_auth(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static struct ctl_table sctp_table[] = { { @@ -343,8 +338,7 @@ static struct ctl_table sctp_net_table[] = { }; static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; struct ctl_table tbl; @@ -389,8 +383,7 @@ static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, } static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; unsigned int min = *(unsigned int *) ctl->extra1; @@ -418,8 +411,7 @@ static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, } static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; unsigned int min = *(unsigned int *) ctl->extra1; @@ -447,8 +439,7 @@ static int proc_sctp_do_rto_max(struct 
ctl_table *ctl, int write, } static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (write) pr_warn_once("Changing rto_alpha or rto_beta may lead to " @@ -458,8 +449,7 @@ static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, } static int proc_sctp_do_auth(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; struct ctl_table tbl; diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index d75f17b56f0e..999eee1ed61c 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -60,7 +60,7 @@ rpc_unregister_sysctl(void) } static int proc_do_xprt(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char tmpbuf[256]; size_t len; @@ -70,15 +70,15 @@ static int proc_do_xprt(struct ctl_table *table, int write, return 0; } len = svc_print_xprts(tmpbuf, sizeof(tmpbuf)); - return simple_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len); + return memory_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len); } static int -proc_dodebug(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +proc_dodebug(struct ctl_table *table, int write, void *buffer, size_t *lenp, + loff_t *ppos) { - char tmpbuf[20], c, *s = NULL; - char __user *p; + char tmpbuf[20], *s = NULL; + char *p; unsigned int value; size_t left, len; @@ -90,18 +90,17 @@ proc_dodebug(struct ctl_table *table, int write, left = *lenp; if (write) { - if (!access_ok(buffer, left)) - return -EFAULT; p = buffer; - while (left && __get_user(c, p) >= 0 && isspace(c)) - left--, p++; + while (left && isspace(*p)) { + left--; + p++; + } if (!left) goto done; if (left > sizeof(tmpbuf) - 1) return -EINVAL; - if (copy_from_user(tmpbuf, p, left)) - return -EFAULT; + memcpy(tmpbuf, p, left); tmpbuf[left] = '\0'; value = simple_strtol(tmpbuf, &s, 0); @@ -121,11 +120,9 @@ proc_dodebug(struct ctl_table *table, int write, len = sprintf(tmpbuf, "0x%04x", *(unsigned int *) table->data); if (len > left) len = left; - if (copy_to_user(buffer, tmpbuf, len)) - return -EFAULT; + memcpy(buffer, tmpbuf, len); if ((left -= len) > 0) { - if (put_user('\n', (char __user *)buffer + len)) - return -EFAULT; + *((char *)buffer + len) = '\n'; left--; } } diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 97bca509a391..526da5d4710b 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -80,8 +80,7 @@ atomic_t rdma_stat_sq_prod; * current value. 
*/ static int read_reset_stat(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { atomic_t *stat = (atomic_t *)table->data; @@ -103,8 +102,8 @@ static int read_reset_stat(struct ctl_table *table, int write, len -= *ppos; if (len > *lenp) len = *lenp; - if (len && copy_to_user(buffer, str_buf, len)) - return -EFAULT; + if (len) + memcpy(buffer, str_buf, len); *lenp = len; *ppos += len; } diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index b621ad74f54a..27e371b44dad 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -1696,7 +1696,7 @@ static int __init alloc_buffers(void) #ifdef CONFIG_SYSCTL static int apparmor_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!policy_admin_capable(NULL)) return -EPERM; diff --git a/security/min_addr.c b/security/min_addr.c index 94d2b0cf0e7b..88c9a6a21f47 100644 --- a/security/min_addr.c +++ b/security/min_addr.c @@ -30,7 +30,7 @@ static void update_mmap_min_addr(void) * calls update_mmap_min_addr() so non MAP_FIXED hints get rounded properly */ int mmap_min_addr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index 94dc346370b1..536c99646f6a 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -430,7 +430,7 @@ static struct security_hook_list yama_hooks[] __lsm_ro_after_init = { #ifdef CONFIG_SYSCTL static int yama_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table table_copy; -- cgit v1.2.3
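Every hunk in the patch above makes the same mechanical change: ->proc_handler callbacks now receive a kernel-space buffer instead of a user-space one, so access_ok()/copy_from_user()/copy_to_user() can be replaced with plain pointer accesses. As a hedged illustration (the handler and the value it backs are invented for this sketch, not taken from any patch above), a handler written against the new convention looks roughly like this:

/* Sketch only: a handler under the new kernel-pointer convention.
 * "buffer" is a kernel pointer, so the generic helpers can be called
 * directly and any extra parsing can use plain memory accesses.
 */
static int example_dointvec(struct ctl_table *table, int write,
			    void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret = proc_dointvec(table, write, buffer, lenp, ppos);

	if (write && !ret)
		pr_debug("example sysctl set to %d\n", *(int *)table->data);

	return ret;
}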
From e62905ae34eaf5fe2cfb254be5e0c097b3b1f798 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 23 Apr 2020 11:39:20 +0200 Subject: xfrm interface: don't take extra reference to netdev I don't see any reason to do this. Maybe needed before commit 56c5ee1a5823 ("xfrm interface: fix memory leak on creation"). Signed-off-by: Nicolas Dichtel Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_interface.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 3361e3ac5714..eb9928c0a87c 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -145,7 +145,6 @@ static int xfrmi_create(struct net_device *dev) if (err < 0) goto out; - dev_hold(dev); xfrmi_link(xfrmn, xi); return 0; @@ -175,7 +174,6 @@ static void xfrmi_dev_uninit(struct net_device *dev) struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id); xfrmi_unlink(xfrmn, xi); - dev_put(dev); } static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) -- cgit v1.2.3 From 7d3118016787b5c05da94b3bcdb96c9d6ff82c44 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 25 Apr 2020 12:28:14 +0100 Subject: net: rtnetlink: remove redundant assignment to variable err The variable err is being initialized with a value that is never read and it is being updated later with a new value. The initialization is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d6f4f4a9e8ba..2269199c5891 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3997,8 +3997,8 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; - int err = -EINVAL; __u8 *addr; + int err; u16 vid; if (!netlink_capable(skb, CAP_NET_ADMIN)) -- cgit v1.2.3 From 2cc974f83fb505751b7fbcf8dee27bdcc7054a7e Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:21:59 +0200 Subject: bridge: mrp: Update Kconfig Add the BRIDGE_MRP option to allow building the bridge with or without MRP support. The default value is N. Reviewed-by: Nikolay Aleksandrov Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/Kconfig | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig index e4fb050e2078..51a6414145d2 100644 --- a/net/bridge/Kconfig +++ b/net/bridge/Kconfig @@ -61,3 +61,15 @@ config BRIDGE_VLAN_FILTERING Say N to exclude this support and reduce the binary size. If unsure, say Y. + +config BRIDGE_MRP + bool "MRP protocol" + depends on BRIDGE + default n + help + If you say Y here, then the Ethernet bridge will be able to run the + MRP protocol to detect loops. + + Say N to exclude this support and reduce the binary size. + + If unsure, say N. -- cgit v1.2.3 From 4b8d7d4c599182393421c190bae3604b4db9629a Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:00 +0200 Subject: bridge: mrp: Extend bridge interface To integrate MRP into the bridge, first the bridge needs to be aware of the ports that are part of an MRP ring and of which rings are on the bridge. Therefore extend the bridge interface with the following (a short usage sketch follows the list): - add a new flag (BR_MRP_AWARE) to the net bridge ports; this bit will be set when the port is added to an MRP instance. In this way the bridge knows whether a frame was received on an MRP ring port. - add a new flag (BR_MRP_LOST_CONT) to the net bridge ports; this bit will be set when the port loses the continuity of MRP Test frames. - add a list of MRP instances
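As a usage sketch (the ex_* helpers are invented for illustration; the flag logic mirrors how the later patches in this series use the bits), the two flags are meant to be exercised like this:

/* Illustrative only: how the two new per-port flags are meant to be
 * used. BR_MRP_AWARE marks a port that belongs to an MRP instance;
 * BR_MRP_LOST_CONT latches loss of MRP_Test frame continuity.
 */
static bool ex_port_is_mrp_ring_port(const struct net_bridge_port *p)
{
	return !!(p->flags & BR_MRP_AWARE);
}

static void ex_port_update_continuity(struct net_bridge_port *p, bool lost)
{
	if (lost)
		p->flags |= BR_MRP_LOST_CONT;
	else
		p->flags &= ~BR_MRP_LOST_CONT;
}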
Reviewed-by: Nikolay Aleksandrov Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 2 ++ net/bridge/br_private.h | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'net') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 9e57c4411734..b3a8d3054af0 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -47,6 +47,8 @@ struct br_ip_list { #define BR_BCAST_FLOOD BIT(14) #define BR_NEIGH_SUPPRESS BIT(15) #define BR_ISOLATED BIT(16) +#define BR_MRP_AWARE BIT(17) +#define BR_MRP_LOST_CONT BIT(18) #define BR_DEFAULT_AGEING_TIME (300 * HZ) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 1f97703a52ff..835a70f8d3ea 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -428,6 +428,10 @@ struct net_bridge { int offload_fwd_mark; #endif struct hlist_head fdb_list; + +#if IS_ENABLED(CONFIG_BRIDGE_MRP) + struct list_head __rcu mrp_list; +#endif }; struct br_input_skb_cb { -- cgit v1.2.3 From 3e54442c93845316762b1b3c75e654463fd1b715 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:01 +0200 Subject: net: bridge: Add port attribute IFLA_BRPORT_MRP_RING_OPEN This patch adds a new port attribute, IFLA_BRPORT_MRP_RING_OPEN, which allows notifying userspace when the port loses the continuity of MRP frames. This attribute is set by the kernel whenever the SW or HW detects that the ring is being opened or closed.
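As a hedged illustration of the userspace side (not part of this patch; the helper is invented and assumes the caller has already parsed the port's bridge attributes out of an RTM_NEWLINK notification with the usual RTA_* macros):

/* Sketch only: reacting to the new attribute in a rtnetlink monitor. */
#include <stdio.h>
#include <linux/if_link.h>
#include <linux/rtnetlink.h>

static void ex_check_ring_open(struct rtattr *tb[])
{
	if (tb[IFLA_BRPORT_MRP_RING_OPEN] &&
	    *(unsigned char *)RTA_DATA(tb[IFLA_BRPORT_MRP_RING_OPEN]))
		printf("MRP ring open: port lost MRP_Test continuity\n");
}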
Reviewed-by: Nikolay Aleksandrov Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + net/bridge/br_netlink.c | 3 +++ tools/include/uapi/linux/if_link.h | 1 + 3 files changed, 5 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 127c704eeba9..a009365ad67b 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -343,6 +343,7 @@ enum { IFLA_BRPORT_NEIGH_SUPPRESS, IFLA_BRPORT_ISOLATED, IFLA_BRPORT_BACKUP_PORT, + IFLA_BRPORT_MRP_RING_OPEN, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 43dab4066f91..4084f1ef8641 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -151,6 +151,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MULTICAST_ROUTER */ #endif + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_GROUP_FWD_MASK */ + + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_RING_OPEN */ + 0; } @@ -213,6 +214,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) || nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS, !!(p->flags & BR_NEIGH_SUPPRESS)) || + nla_put_u8(skb, IFLA_BRPORT_MRP_RING_OPEN, !!(p->flags & + BR_MRP_LOST_CONT)) || nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED))) return -EMSGSIZE; diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index ca6665ea758a..cafedbbfefbe 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -343,6 +343,7 @@ enum { IFLA_BRPORT_NEIGH_SUPPRESS, IFLA_BRPORT_ISOLATED, IFLA_BRPORT_BACKUP_PORT, + IFLA_BRPORT_MRP_RING_OPEN, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) -- cgit v1.2.3 From 2f1a11ae11d222b3a3b41d09a85cb2bf8f83db49 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:02 +0200 Subject: bridge: mrp: Add MRP interface. Define the MRP interface. This interface is used by the netlink code to update the MRP instances and by the MRP code to call into switchdev to offload the configuration to HW. It defines an MRP instance, 'struct br_mrp', which is kept on a list of MRP instances that is part of 'struct net_bridge'. Each instance has 2 ring ports, a bridge and an ID. In case the HW can't generate MRP Test frames then the SW will generate those. The following functions make up the interface (a usage sketch follows the list): br_mrp_add - adds a new MRP instance. br_mrp_del - deletes an existing MRP instance. Each instance has an ID (ring_id). br_mrp_set_port_state - changes the port state. The port can be in forwarding state, which means that frames can pass through, or in blocked state, which means that no frames except MRP frames can pass through. This will eventually call the switchdev API to notify the HW. This information is also used by the SW bridge to know how to forward frames in case the HW doesn't have this capability. br_mrp_set_port_role - a port role can be primary or secondary. This information is required to be pushed to the HW in case the HW can generate MRP_Test frames, because the MRP_Test frames contain a field with this information. Otherwise the HW will not be able to generate the frames correctly. br_mrp_set_ring_state - a ring can be in state open or closed. State open means that the mrp port stopped receiving MRP_Test frames, while closed means that the mrp port received MRP_Test frames. As with the port role, this information is pushed to the HW because the MRP_Test frames contain this information. br_mrp_set_ring_role - a ring can have one of the following roles: MRM or MRC. For the role MRM it is expected that the HW can terminate the MRP frames, notify the SW that it stopped receiving MRP_Test frames and trap all the other MRP frames. For MRC mode it is expected that the HW can forward the MRP frames only between the MRP ports and copy MRP_Topology frames to the CPU. In case the HW doesn't support a role it needs to return an error code different from -EOPNOTSUPP. br_mrp_start_test - this starts/stops the generation of MRP_Test frames. To stop the generation of frames the interval needs to have a value of 0. The userspace needs to know whether the HW supports this, so that duplicate frames (generated by both HW and SW) are avoided: if the HW supports this then the SW will not generate frames anymore and will expect the HW to notify it, via br_mrp_port_open, when it stops receiving MRP frames. br_mrp_port_open - this function is used by drivers to notify the userspace via a netlink callback that one of the ports stopped receiving MRP_Test frames. This function is called only when the node has the role MRM. It is not supposed to be called from userspace. br_mrp_switchdev_add - this corresponds to the function br_mrp_add, and will notify the HW that an MRP instance is added. The function gets the MRP instance as parameter. br_mrp_switchdev_del - this corresponds to the function br_mrp_del, and will notify the HW that an MRP instance is removed. The function gets the ID of the removed MRP instance as parameter. br_mrp_port_switchdev_set_state - this corresponds to the function br_mrp_set_port_state. It notifies the HW whether it should block non-MRP frames. br_mrp_port_switchdev_set_role - this corresponds to the function br_mrp_set_port_role. It sets the port role, primary or secondary. br_mrp_switchdev_set_ring_role - this corresponds to the function br_mrp_set_ring_role and sets one of the ring roles, MRM or MRC. br_mrp_switchdev_set_ring_state - this corresponds to the function br_mrp_set_ring_state and sets the ring to be open or closed. br_mrp_switchdev_send_ring_test - this corresponds to the function br_mrp_start_test. This will notify the HW to start or stop generating MRP_Test frames. Value 0 for the interval parameter means to stop generating the frames.
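To make the call flow concrete, here is a minimal sketch (not part of the patch; the ring ID and timing values are invented) of how a caller such as the netlink code added later in this series would configure a manager (MRM) node:

/* Illustrative sketch only: driving the interface for an MRM node. */
static int ex_setup_mrm(struct net_bridge *br, u32 p_ifindex, u32 s_ifindex)
{
	struct br_mrp_instance inst = {
		.ring_id = 1,
		.p_ifindex = p_ifindex,
		.s_ifindex = s_ifindex,
	};
	struct br_mrp_ring_role role = {
		.ring_id = 1,
		.ring_role = BR_MRP_RING_ROLE_MRM,
	};
	struct br_mrp_start_test test = {
		.ring_id = 1,
		.interval = 500000,	/* usecs between MRP_Test frames */
		.max_miss = 3,		/* misses before the ring is open */
		.period = 10000000,	/* how long to keep testing, usecs */
	};
	int err;

	err = br_mrp_add(br, &inst);
	if (err)
		return err;

	err = br_mrp_set_ring_role(br, &role);
	if (err) {
		br_mrp_del(br, &inst);
		return err;
	}

	/* Offloaded to HW when possible, otherwise generated in SW */
	return br_mrp_start_test(br, &test);
}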
Reviewed-by: Nikolay Aleksandrov Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/br_private_mrp.h | 63 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 net/bridge/br_private_mrp.h (limited to 'net') diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h new file mode 100644 index 000000000000..2921a4b59f8e --- /dev/null +++ b/net/bridge/br_private_mrp.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _BR_PRIVATE_MRP_H_ +#define _BR_PRIVATE_MRP_H_ + +#include "br_private.h" +#include <uapi/linux/mrp_bridge.h> + +struct br_mrp { + /* list of mrp instances */ + struct list_head __rcu list; + + struct net_bridge_port __rcu *p_port; + struct net_bridge_port __rcu *s_port; + + u32 ring_id; + + enum br_mrp_ring_role_type ring_role; + u8 ring_role_offloaded; + enum br_mrp_ring_state_type ring_state; + u32 ring_transitions; + + struct delayed_work test_work; + u32 test_interval; + unsigned long test_end; + u32 test_count_miss; + u32 test_max_miss; + + u32 seq_id; + + struct rcu_head rcu; +}; + +/* br_mrp.c */ +int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance); +int br_mrp_del(struct net_bridge *br, struct br_mrp_instance *instance); +int br_mrp_set_port_state(struct net_bridge_port *p, + enum br_mrp_port_state_type state); +int br_mrp_set_port_role(struct net_bridge_port *p, + struct br_mrp_port_role *role); +int br_mrp_set_ring_state(struct net_bridge *br, + struct br_mrp_ring_state *state); +int br_mrp_set_ring_role(struct net_bridge *br, struct br_mrp_ring_role *role); +int br_mrp_start_test(struct net_bridge *br, struct br_mrp_start_test *test); + +/* br_mrp_switchdev.c */ +int br_mrp_switchdev_add(struct net_bridge *br, struct br_mrp *mrp); +int br_mrp_switchdev_del(struct net_bridge *br, struct br_mrp *mrp); +int br_mrp_switchdev_set_ring_role(struct net_bridge *br, struct br_mrp *mrp, + enum br_mrp_ring_role_type role); +int br_mrp_switchdev_set_ring_state(struct net_bridge *br, struct br_mrp *mrp, + enum br_mrp_ring_state_type state); +int br_mrp_switchdev_send_ring_test(struct net_bridge *br, struct br_mrp *mrp, + u32 interval, u8 max_miss, u32 period); +int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, + enum br_mrp_port_state_type state); +int br_mrp_port_switchdev_set_role(struct net_bridge_port *p, + enum br_mrp_port_role_type role); + +/* br_mrp_netlink.c */ +int br_mrp_port_open(struct net_device *dev, u8 loc); + +#endif /* _BR_PRIVATE_MRP_H */ -- cgit v1.2.3 From fadd409136f0f21192d80816edd9529f27d88c17 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:04 +0200 Subject: bridge: switchdev: mrp: Implement MRP API for switchdev Implement the MRP API for switchdev. These functions eventually just call the switchdev functions switchdev_port_obj_add/del and switchdev_port_attr_set, as sketched below.
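As a hedged driver-side sketch (hypothetical driver; the ex_* names are invented and the callback signature is simplified relative to the real switchdev notifier plumbing), a driver would accept the MRP object emitted by br_mrp_switchdev_add() roughly like this:

/* Hypothetical sketch: dispatching SWITCHDEV_OBJ_ID_MRP in a driver's
 * SWITCHDEV_PORT_OBJ_ADD handling. ex_hw_mrp_ring_add() is invented.
 */
static int ex_hw_mrp_ring_add(struct net_device *dev,
			      const struct switchdev_obj_mrp *mrp);

static int ex_port_obj_add(struct net_device *dev,
			   const struct switchdev_obj *obj)
{
	switch (obj->id) {
	case SWITCHDEV_OBJ_ID_MRP:
		/* program ring ->ring_id on ->p_port/->s_port in HW;
		 * returning -EOPNOTSUPP makes the bridge fall back to SW
		 */
		return ex_hw_mrp_ring_add(dev,
					  container_of(obj,
						       struct switchdev_obj_mrp,
						       obj));
	default:
		return -EOPNOTSUPP;
	}
}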
Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/Makefile | 2 + net/bridge/br_mrp_switchdev.c | 140 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 142 insertions(+) create mode 100644 net/bridge/br_mrp_switchdev.c (limited to 'net') diff --git a/net/bridge/Makefile b/net/bridge/Makefile index 49da7ae6f077..3cacf9dd78d5 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -25,3 +25,5 @@ bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o br_vlan_opt bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o obj-$(CONFIG_NETFILTER) += netfilter/ + +bridge-$(CONFIG_BRIDGE_MRP) += br_mrp_switchdev.o diff --git a/net/bridge/br_mrp_switchdev.c b/net/bridge/br_mrp_switchdev.c new file mode 100644 index 000000000000..51cb1d5a24b4 --- /dev/null +++ b/net/bridge/br_mrp_switchdev.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <net/switchdev.h> + +#include "br_private_mrp.h" + +int br_mrp_switchdev_add(struct net_bridge *br, struct br_mrp *mrp) +{ + struct switchdev_obj_mrp mrp_obj = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_MRP, + .p_port = rtnl_dereference(mrp->p_port)->dev, + .s_port = rtnl_dereference(mrp->s_port)->dev, + .ring_id = mrp->ring_id, + }; + int err; + + err = switchdev_port_obj_add(br->dev, &mrp_obj.obj, NULL); + + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} + +int br_mrp_switchdev_del(struct net_bridge *br, struct br_mrp *mrp) +{ + struct switchdev_obj_mrp mrp_obj = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_MRP, + .p_port = NULL, + .s_port = NULL, + .ring_id = mrp->ring_id, + }; + int err; + + err = switchdev_port_obj_del(br->dev, &mrp_obj.obj); + + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} + +int br_mrp_switchdev_set_ring_role(struct net_bridge *br, + struct br_mrp *mrp, + enum br_mrp_ring_role_type role) +{ + struct switchdev_obj_ring_role_mrp mrp_role = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_RING_ROLE_MRP, + .ring_role = role, + .ring_id = mrp->ring_id, + }; + int err; + + if (role == BR_MRP_RING_ROLE_DISABLED) + err = switchdev_port_obj_del(br->dev, &mrp_role.obj); + else + err = switchdev_port_obj_add(br->dev, &mrp_role.obj, NULL); + + return err; +} + +int br_mrp_switchdev_send_ring_test(struct net_bridge *br, + struct br_mrp *mrp, u32 interval, + u8 max_miss, u32 period) +{ + struct switchdev_obj_ring_test_mrp test = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_RING_TEST_MRP, + .interval = interval, + .max_miss = max_miss, + .ring_id = mrp->ring_id, + .period = period, + }; + int err; + + if (interval == 0) + err = switchdev_port_obj_del(br->dev, &test.obj); + else + err = switchdev_port_obj_add(br->dev, &test.obj, NULL); + + return err; +} + +int br_mrp_switchdev_set_ring_state(struct net_bridge *br, + struct br_mrp *mrp, + enum br_mrp_ring_state_type state) +{ + struct switchdev_obj_ring_state_mrp mrp_state = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_RING_STATE_MRP, + .ring_state = state, + .ring_id = mrp->ring_id, + }; + int err; + + err = switchdev_port_obj_add(br->dev, &mrp_state.obj, NULL); + + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} + +int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, + enum br_mrp_port_state_type state) +{ + struct switchdev_attr attr = { + .orig_dev = p->dev, + .id = SWITCHDEV_ATTR_ID_MRP_PORT_STATE, + .u.mrp_port_state = state, + }; + int err; + + err = switchdev_port_attr_set(p->dev, &attr); + if (err && err != -EOPNOTSUPP) + br_warn(p->br, "error setting offload MRP state on port
%u(%s)\n", + (unsigned int)p->port_no, p->dev->name); + + return err; +} + +int br_mrp_port_switchdev_set_role(struct net_bridge_port *p, + enum br_mrp_port_role_type role) +{ + struct switchdev_attr attr = { + .orig_dev = p->dev, + .id = SWITCHDEV_ATTR_ID_MRP_PORT_ROLE, + .u.mrp_port_role = role, + }; + int err; + + err = switchdev_port_attr_set(p->dev, &attr); + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} -- cgit v1.2.3 From 9a9f26e8f7ea300e8efffcae036dbef239be433a Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:05 +0200 Subject: bridge: mrp: Connect MRP API with the switchdev API Implement the MRP API. In case the HW can't generate MRP Test frames then the SW will try to generate the frames. In case that also the SW will fail in generating the frames then a error is return to the userspace. The userspace is responsible to generate all the other MRP frames regardless if the test frames are generated by HW or SW. The forwarding/termination of MRP frames is happening in the kernel and is done by the MRP instance. The userspace application doesn't do the forwarding. Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/Makefile | 2 +- net/bridge/br_mrp.c | 559 ++++++++++++++++++++++++++++++++++++++++++++ net/bridge/br_mrp_netlink.c | 29 +++ 3 files changed, 589 insertions(+), 1 deletion(-) create mode 100644 net/bridge/br_mrp.c create mode 100644 net/bridge/br_mrp_netlink.c (limited to 'net') diff --git a/net/bridge/Makefile b/net/bridge/Makefile index 3cacf9dd78d5..ccb394236fbd 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -26,4 +26,4 @@ bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o obj-$(CONFIG_NETFILTER) += netfilter/ -bridge-$(CONFIG_BRIDGE_MRP) += br_mrp_switchdev.o +bridge-$(CONFIG_BRIDGE_MRP) += br_mrp_switchdev.o br_mrp.o br_mrp_netlink.o diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c new file mode 100644 index 000000000000..d7bc09de4c13 --- /dev/null +++ b/net/bridge/br_mrp.c @@ -0,0 +1,559 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include "br_private_mrp.h" + +static const u8 mrp_test_dmac[ETH_ALEN] = { 0x1, 0x15, 0x4e, 0x0, 0x0, 0x1 }; + +static struct net_bridge_port *br_mrp_get_port(struct net_bridge *br, + u32 ifindex) +{ + struct net_bridge_port *res = NULL; + struct net_bridge_port *port; + + list_for_each_entry(port, &br->port_list, list) { + if (port->dev->ifindex == ifindex) { + res = port; + break; + } + } + + return res; +} + +static struct br_mrp *br_mrp_find_id(struct net_bridge *br, u32 ring_id) +{ + struct br_mrp *res = NULL; + struct br_mrp *mrp; + + list_for_each_entry_rcu(mrp, &br->mrp_list, list, + lockdep_rtnl_is_held()) { + if (mrp->ring_id == ring_id) { + res = mrp; + break; + } + } + + return res; +} + +static struct br_mrp *br_mrp_find_port(struct net_bridge *br, + struct net_bridge_port *p) +{ + struct br_mrp *res = NULL; + struct br_mrp *mrp; + + list_for_each_entry_rcu(mrp, &br->mrp_list, list, + lockdep_rtnl_is_held()) { + if (rcu_access_pointer(mrp->p_port) == p || + rcu_access_pointer(mrp->s_port) == p) { + res = mrp; + break; + } + } + + return res; +} + +static int br_mrp_next_seq(struct br_mrp *mrp) +{ + mrp->seq_id++; + return mrp->seq_id; +} + +static struct sk_buff *br_mrp_skb_alloc(struct net_bridge_port *p, + const u8 *src, const u8 *dst) +{ + struct ethhdr *eth_hdr; + struct sk_buff *skb; + u16 *version; + + skb = dev_alloc_skb(MRP_MAX_FRAME_LENGTH); + if (!skb) + return NULL; + + skb->dev = p->dev; + skb->protocol = 
htons(ETH_P_MRP); + skb->priority = MRP_FRAME_PRIO; + skb_reserve(skb, sizeof(*eth_hdr)); + + eth_hdr = skb_push(skb, sizeof(*eth_hdr)); + ether_addr_copy(eth_hdr->h_dest, dst); + ether_addr_copy(eth_hdr->h_source, src); + eth_hdr->h_proto = htons(ETH_P_MRP); + + version = skb_put(skb, sizeof(*version)); + *version = cpu_to_be16(MRP_VERSION); + + return skb; +} + +static void br_mrp_skb_tlv(struct sk_buff *skb, + enum br_mrp_tlv_header_type type, + u8 length) +{ + struct br_mrp_tlv_hdr *hdr; + + hdr = skb_put(skb, sizeof(*hdr)); + hdr->type = type; + hdr->length = length; +} + +static void br_mrp_skb_common(struct sk_buff *skb, struct br_mrp *mrp) +{ + struct br_mrp_common_hdr *hdr; + + br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_COMMON, sizeof(*hdr)); + + hdr = skb_put(skb, sizeof(*hdr)); + hdr->seq_id = cpu_to_be16(br_mrp_next_seq(mrp)); + memset(hdr->domain, 0xff, MRP_DOMAIN_UUID_LENGTH); +} + +static struct sk_buff *br_mrp_alloc_test_skb(struct br_mrp *mrp, + struct net_bridge_port *p, + enum br_mrp_port_role_type port_role) +{ + struct br_mrp_ring_test_hdr *hdr = NULL; + struct sk_buff *skb = NULL; + + if (!p) + return NULL; + + skb = br_mrp_skb_alloc(p, p->dev->dev_addr, mrp_test_dmac); + if (!skb) + return NULL; + + br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_RING_TEST, sizeof(*hdr)); + hdr = skb_put(skb, sizeof(*hdr)); + + hdr->prio = cpu_to_be16(MRP_DEFAULT_PRIO); + ether_addr_copy(hdr->sa, p->br->dev->dev_addr); + hdr->port_role = cpu_to_be16(port_role); + hdr->state = cpu_to_be16(mrp->ring_state); + hdr->transitions = cpu_to_be16(mrp->ring_transitions); + hdr->timestamp = cpu_to_be32(jiffies_to_msecs(jiffies)); + + br_mrp_skb_common(skb, mrp); + br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_END, 0x0); + + return skb; +} + +static void br_mrp_test_work_expired(struct work_struct *work) +{ + struct delayed_work *del_work = to_delayed_work(work); + struct br_mrp *mrp = container_of(del_work, struct br_mrp, test_work); + struct net_bridge_port *p; + bool notify_open = false; + struct sk_buff *skb; + + if (time_before_eq(mrp->test_end, jiffies)) + return; + + if (mrp->test_count_miss < mrp->test_max_miss) { + mrp->test_count_miss++; + } else { + /* Notify that the ring is open only if the ring state is + * closed, otherwise it would continue to notify at every + * interval. + */ + if (mrp->ring_state == BR_MRP_RING_STATE_CLOSED) + notify_open = true; + } + + rcu_read_lock(); + + p = rcu_dereference(mrp->p_port); + if (p) { + skb = br_mrp_alloc_test_skb(mrp, p, BR_MRP_PORT_ROLE_PRIMARY); + if (!skb) + goto out; + + skb_reset_network_header(skb); + dev_queue_xmit(skb); + + if (notify_open && !mrp->ring_role_offloaded) + br_mrp_port_open(p->dev, true); + } + + p = rcu_dereference(mrp->s_port); + if (p) { + skb = br_mrp_alloc_test_skb(mrp, p, BR_MRP_PORT_ROLE_SECONDARY); + if (!skb) + goto out; + + skb_reset_network_header(skb); + dev_queue_xmit(skb); + + if (notify_open && !mrp->ring_role_offloaded) + br_mrp_port_open(p->dev, true); + } + +out: + rcu_read_unlock(); + + queue_delayed_work(system_wq, &mrp->test_work, + usecs_to_jiffies(mrp->test_interval)); +} + +/* Deletes the MRP instance. 
+ * note: called under rtnl_lock + */ +static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp) +{ + struct net_bridge_port *p; + + /* Stop sending MRP_Test frames */ + cancel_delayed_work_sync(&mrp->test_work); + br_mrp_switchdev_send_ring_test(br, mrp, 0, 0, 0); + + br_mrp_switchdev_del(br, mrp); + + /* Reset the ports */ + p = rtnl_dereference(mrp->p_port); + if (p) { + spin_lock_bh(&br->lock); + p->state = BR_STATE_FORWARDING; + p->flags &= ~BR_MRP_AWARE; + spin_unlock_bh(&br->lock); + br_mrp_port_switchdev_set_state(p, BR_STATE_FORWARDING); + rcu_assign_pointer(mrp->p_port, NULL); + } + + p = rtnl_dereference(mrp->s_port); + if (p) { + spin_lock_bh(&br->lock); + p->state = BR_STATE_FORWARDING; + p->flags &= ~BR_MRP_AWARE; + spin_unlock_bh(&br->lock); + br_mrp_port_switchdev_set_state(p, BR_STATE_FORWARDING); + rcu_assign_pointer(mrp->s_port, NULL); + } + + list_del_rcu(&mrp->list); + kfree_rcu(mrp, rcu); +} + +/* Adds a new MRP instance. + * note: called under rtnl_lock + */ +int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance) +{ + struct net_bridge_port *p; + struct br_mrp *mrp; + int err; + + /* If the ring exists, it is not possible to create another one with the + * same ring_id + */ + mrp = br_mrp_find_id(br, instance->ring_id); + if (mrp) + return -EINVAL; + + if (!br_mrp_get_port(br, instance->p_ifindex) || + !br_mrp_get_port(br, instance->s_ifindex)) + return -EINVAL; + + mrp = kzalloc(sizeof(*mrp), GFP_KERNEL); + if (!mrp) + return -ENOMEM; + + mrp->ring_id = instance->ring_id; + + p = br_mrp_get_port(br, instance->p_ifindex); + spin_lock_bh(&br->lock); + p->state = BR_STATE_FORWARDING; + p->flags |= BR_MRP_AWARE; + spin_unlock_bh(&br->lock); + rcu_assign_pointer(mrp->p_port, p); + + p = br_mrp_get_port(br, instance->s_ifindex); + spin_lock_bh(&br->lock); + p->state = BR_STATE_FORWARDING; + p->flags |= BR_MRP_AWARE; + spin_unlock_bh(&br->lock); + rcu_assign_pointer(mrp->s_port, p); + + INIT_DELAYED_WORK(&mrp->test_work, br_mrp_test_work_expired); + list_add_tail_rcu(&mrp->list, &br->mrp_list); + + err = br_mrp_switchdev_add(br, mrp); + if (err) + goto delete_mrp; + + return 0; + +delete_mrp: + br_mrp_del_impl(br, mrp); + + return err; +} + +/* Deletes the MRP instance from which the port is part of + * note: called under rtnl_lock + */ +void br_mrp_port_del(struct net_bridge *br, struct net_bridge_port *p) +{ + struct br_mrp *mrp = br_mrp_find_port(br, p); + + /* If the port is not part of a MRP instance just bail out */ + if (!mrp) + return; + + br_mrp_del_impl(br, mrp); +} + +/* Deletes existing MRP instance based on ring_id + * note: called under rtnl_lock + */ +int br_mrp_del(struct net_bridge *br, struct br_mrp_instance *instance) +{ + struct br_mrp *mrp = br_mrp_find_id(br, instance->ring_id); + + if (!mrp) + return -EINVAL; + + br_mrp_del_impl(br, mrp); + + return 0; +} + +/* Set port state, port state can be forwarding, blocked or disabled + * note: already called with rtnl_lock + */ +int br_mrp_set_port_state(struct net_bridge_port *p, + enum br_mrp_port_state_type state) +{ + if (!p || !(p->flags & BR_MRP_AWARE)) + return -EINVAL; + + spin_lock_bh(&p->br->lock); + + if (state == BR_MRP_PORT_STATE_FORWARDING) + p->state = BR_STATE_FORWARDING; + else + p->state = BR_STATE_BLOCKING; + + spin_unlock_bh(&p->br->lock); + + br_mrp_port_switchdev_set_state(p, state); + + return 0; +} + +/* Set port role, port role can be primary or secondary + * note: already called with rtnl_lock + */ +int br_mrp_set_port_role(struct net_bridge_port *p, + 
struct br_mrp_port_role *role) +{ + struct br_mrp *mrp; + + if (!p || !(p->flags & BR_MRP_AWARE)) + return -EINVAL; + + mrp = br_mrp_find_id(p->br, role->ring_id); + + if (!mrp) + return -EINVAL; + + if (role->role == BR_MRP_PORT_ROLE_PRIMARY) + rcu_assign_pointer(mrp->p_port, p); + else + rcu_assign_pointer(mrp->s_port, p); + + br_mrp_port_switchdev_set_role(p, role->role); + + return 0; +} + +/* Set the ring state; the ring state can only be Open or Closed + * note: already called with rtnl_lock + */ +int br_mrp_set_ring_state(struct net_bridge *br, + struct br_mrp_ring_state *state) +{ + struct br_mrp *mrp = br_mrp_find_id(br, state->ring_id); + + if (!mrp) + return -EINVAL; + + if (mrp->ring_state == BR_MRP_RING_STATE_CLOSED && + state->ring_state != BR_MRP_RING_STATE_CLOSED) + mrp->ring_transitions++; + + mrp->ring_state = state->ring_state; + + br_mrp_switchdev_set_ring_state(br, mrp, state->ring_state); + + return 0; +} + +/* Set the ring role; the ring role can only be MRM (Media Redundancy Manager) + * or MRC (Media Redundancy Client). + * note: already called with rtnl_lock + */ +int br_mrp_set_ring_role(struct net_bridge *br, + struct br_mrp_ring_role *role) +{ + struct br_mrp *mrp = br_mrp_find_id(br, role->ring_id); + int err; + + if (!mrp) + return -EINVAL; + + mrp->ring_role = role->ring_role; + + /* If there is an error, just bail out */ + err = br_mrp_switchdev_set_ring_role(br, mrp, role->ring_role); + if (err && err != -EOPNOTSUPP) + return err; + + /* Now detect if the HW actually applied the role or not. If the HW + * applied the role, the SW no longer needs to do those operations. For + * example, if the role is MRM then the HW will notify the SW when the + * ring is open, but if the role is not pushed to the HW then the SW will + * need to detect when the ring is open + */ + mrp->ring_role_offloaded = err == -EOPNOTSUPP ? 0 : 1; + + return 0; +} + +/* Start generating MRP Test frames; the frames are generated by the HW and, + * if that fails, they are generated by the SW. + * note: already called with rtnl_lock + */ +int br_mrp_start_test(struct net_bridge *br, + struct br_mrp_start_test *test) +{ + struct br_mrp *mrp = br_mrp_find_id(br, test->ring_id); + + if (!mrp) + return -EINVAL; + + /* Try to push it to the HW; if that fails, continue to generate the + * frames in SW, and if that also fails, return an error + */ + if (!br_mrp_switchdev_send_ring_test(br, mrp, test->interval, + test->max_miss, test->period)) + return 0; + + mrp->test_interval = test->interval; + mrp->test_end = jiffies + usecs_to_jiffies(test->period); + mrp->test_max_miss = test->max_miss; + mrp->test_count_miss = 0; + queue_delayed_work(system_wq, &mrp->test_work, + usecs_to_jiffies(test->interval)); + + return 0; +} + +/* Process only MRP Test frames. All the other MRP frames are processed by the + * userspace application + * note: already called with rcu_read_lock + */ +static void br_mrp_mrm_process(struct br_mrp *mrp, struct net_bridge_port *port, + struct sk_buff *skb) +{ + const struct br_mrp_tlv_hdr *hdr; + struct br_mrp_tlv_hdr _hdr; + + /* Each MRP header starts with a version field which is 16 bits. + * Therefore skip the version and get the TLV header directly.
+ */ + hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr); + if (!hdr) + return; + + if (hdr->type != BR_MRP_TLV_HEADER_RING_TEST) + return; + + mrp->test_count_miss = 0; + + /* Notify userspace that the ring is closed, but only if it was not + * already considered closed + */ + if (mrp->ring_state != BR_MRP_RING_STATE_CLOSED) + br_mrp_port_open(port->dev, false); +} + +/* This will just forward the frame to the other MRP ring port (MRC role) or + * will not do anything. + * note: already called with rcu_read_lock + */ +static int br_mrp_rcv(struct net_bridge_port *p, + struct sk_buff *skb, struct net_device *dev) +{ + struct net_device *s_dev, *p_dev, *d_dev; + struct net_bridge_port *p_port, *s_port; + struct net_bridge *br; + struct sk_buff *nskb; + struct br_mrp *mrp; + + /* If the port is disabled, don't accept any frames */ + if (p->state == BR_STATE_DISABLED) + return 0; + + br = p->br; + mrp = br_mrp_find_port(br, p); + if (unlikely(!mrp)) + return 0; + + p_port = rcu_dereference(mrp->p_port); + if (!p_port) + return 0; + + s_port = rcu_dereference(mrp->s_port); + if (!s_port) + return 0; + + /* If the role is MRM then don't forward the frames */ + if (mrp->ring_role == BR_MRP_RING_ROLE_MRM) { + br_mrp_mrm_process(mrp, p, skb); + return 1; + } + + /* Clone the frame and forward it on the other MRP port */ + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return 0; + + p_dev = p_port->dev; + s_dev = s_port->dev; + + if (p_dev == dev) + d_dev = s_dev; + else + d_dev = p_dev; + + nskb->dev = d_dev; + skb_push(nskb, ETH_HLEN); + dev_queue_xmit(nskb); + + return 1; +} + +/* Check if the frame was received on a port that is part of an MRP ring + * and if the frame carries the MRP ethertype. In that case process the frame, + * otherwise do normal forwarding. + * note: already called with rcu_read_lock + */ +int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb) +{ + /* If there is no MRP instance, do normal forwarding */ + if (likely(!(p->flags & BR_MRP_AWARE))) + goto out; + + if (unlikely(skb->protocol == htons(ETH_P_MRP))) + return br_mrp_rcv(p, skb, p->dev); + +out: + return 0; +} + +bool br_mrp_enabled(struct net_bridge *br) +{ + return !list_empty(&br->mrp_list); +} diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c new file mode 100644 index 000000000000..b982db14bbf4 --- /dev/null +++ b/net/bridge/br_mrp_netlink.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include + +#include +#include "br_private.h" +#include "br_private_mrp.h" + +int br_mrp_port_open(struct net_device *dev, u8 loc) +{ + struct net_bridge_port *p; + int err = 0; + + p = br_port_get_rcu(dev); + if (!p) { + err = -EINVAL; + goto out; + } + + if (loc) + p->flags |= BR_MRP_LOST_CONT; + else + p->flags &= ~BR_MRP_LOST_CONT; + + br_ifinfo_notify(RTM_NEWLINK, NULL, p); + +out: + return err; +} -- cgit v1.2.3 From 4d02b8f075153508562803e590f76c4dfe5f4b66 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:06 +0200 Subject: bridge: mrp: Implement netlink interface to configure MRP Implement the netlink interface to configure MRP. The implementation does sanity checks on the attributes and then eventually calls into the MRP interface. Signed-off-by: Horatiu Vultur Signed-off-by: David S.
Miller --- net/bridge/br_mrp_netlink.c | 91 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) (limited to 'net') diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c index b982db14bbf4..503896638be0 100644 --- a/net/bridge/br_mrp_netlink.c +++ b/net/bridge/br_mrp_netlink.c @@ -6,6 +6,97 @@ #include "br_private.h" #include "br_private_mrp.h" +static const struct nla_policy br_mrp_policy[IFLA_BRIDGE_MRP_MAX + 1] = { + [IFLA_BRIDGE_MRP_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_INSTANCE] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct br_mrp_instance)}, + [IFLA_BRIDGE_MRP_PORT_STATE] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_PORT_ROLE] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct br_mrp_port_role)}, + [IFLA_BRIDGE_MRP_RING_STATE] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct br_mrp_ring_state)}, + [IFLA_BRIDGE_MRP_RING_ROLE] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct br_mrp_ring_role)}, + [IFLA_BRIDGE_MRP_START_TEST] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct br_mrp_start_test)}, +}; + +int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, + struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_MAX + 1]; + int err; + + if (br->stp_enabled != BR_NO_STP) { + NL_SET_ERR_MSG_MOD(extack, "MRP can't be enabled if STP is already enabled\n"); + return -EINVAL; + } + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_MAX, attr, + br_mrp_policy, extack); + if (err) + return err; + + if (tb[IFLA_BRIDGE_MRP_INSTANCE]) { + struct br_mrp_instance *instance = + nla_data(tb[IFLA_BRIDGE_MRP_INSTANCE]); + + if (cmd == RTM_SETLINK) + err = br_mrp_add(br, instance); + else + err = br_mrp_del(br, instance); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_PORT_STATE]) { + enum br_mrp_port_state_type state = + nla_get_u32(tb[IFLA_BRIDGE_MRP_PORT_STATE]); + + err = br_mrp_set_port_state(p, state); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_PORT_ROLE]) { + struct br_mrp_port_role *role = + nla_data(tb[IFLA_BRIDGE_MRP_PORT_ROLE]); + + err = br_mrp_set_port_role(p, role); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_RING_STATE]) { + struct br_mrp_ring_state *state = + nla_data(tb[IFLA_BRIDGE_MRP_RING_STATE]); + + err = br_mrp_set_ring_state(br, state); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_RING_ROLE]) { + struct br_mrp_ring_role *role = + nla_data(tb[IFLA_BRIDGE_MRP_RING_ROLE]); + + err = br_mrp_set_ring_role(br, role); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_START_TEST]) { + struct br_mrp_start_test *test = + nla_data(tb[IFLA_BRIDGE_MRP_START_TEST]); + + err = br_mrp_start_test(br, test); + if (err) + return err; + } + + return 0; +} + int br_mrp_port_open(struct net_device *dev, u8 loc) { struct net_bridge_port *p; -- cgit v1.2.3 From 6536993371fab3de4e8379649b60e94d03e6ff37 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:07 +0200 Subject: bridge: mrp: Integrate MRP into the bridge To integrate MRP into the bridge, the bridge needs to do the following: - detect if an MRP frame was received on an MRP ring port; in that case it is processed, otherwise it is forwarded as usual. - enable parsing of MRP - before, whenever the bridge was set up, it would set all the ports in the forwarding state. Add an extra check to not set ports in the forwarding state if the port is an MRP ring port.
The reason for this change is that if the MRP instance initially sets the port in the blocked state, bringing the bridge up would otherwise overwrite this setting. Reviewed-by: Nikolay Aleksandrov Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/br_device.c | 3 +++ net/bridge/br_if.c | 2 ++ net/bridge/br_input.c | 3 +++ net/bridge/br_netlink.c | 5 +++++ net/bridge/br_private.h | 31 +++++++++++++++++++++++++++++++ 5 files changed, 44 insertions(+) (limited to 'net') diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 0e3dbc5f3c34..8ec1362588af 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -463,6 +463,9 @@ void br_dev_setup(struct net_device *dev) spin_lock_init(&br->lock); INIT_LIST_HEAD(&br->port_list); INIT_HLIST_HEAD(&br->fdb_list); +#if IS_ENABLED(CONFIG_BRIDGE_MRP) + INIT_LIST_HEAD(&br->mrp_list); +#endif spin_lock_init(&br->hash_lock); br->bridge_id.prio[0] = 0x80; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 4fe30b182ee7..ca685c0cdf95 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -333,6 +333,8 @@ static void del_nbp(struct net_bridge_port *p) br_stp_disable_port(p); spin_unlock_bh(&br->lock); + br_mrp_port_del(br, p); + br_ifinfo_notify(RTM_DELLINK, NULL, p); list_del_rcu(&p->list); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index fcc260840028..d5c34f36f0f4 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -342,6 +342,9 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) } } + if (unlikely(br_mrp_process(p, skb))) + return RX_HANDLER_PASS; + forward: switch (p->state) { case BR_STATE_FORWARDING: diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 4084f1ef8641..1a5e681a626a 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -672,6 +672,11 @@ static int br_afspec(struct net_bridge *br, if (err) return err; break; + case IFLA_BRIDGE_MRP: + err = br_mrp_parse(br, p, attr, cmd, extack); + if (err) + return err; + break; } } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 835a70f8d3ea..5835828320b6 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1308,6 +1308,37 @@ unsigned long br_timer_value(const struct timer_list *timer); extern int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr); #endif +/* br_mrp.c */ +#if IS_ENABLED(CONFIG_BRIDGE_MRP) +int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, + struct nlattr *attr, int cmd, struct netlink_ext_ack *extack); +int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb); +bool br_mrp_enabled(struct net_bridge *br); +void br_mrp_port_del(struct net_bridge *br, struct net_bridge_port *p); +#else +static inline int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, + struct nlattr *attr, int cmd, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} + +static inline int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb) +{ + return 0; +} + +static inline bool br_mrp_enabled(struct net_bridge *br) +{ + return false; +} + +static inline void br_mrp_port_del(struct net_bridge *br, + struct net_bridge_port *p) +{ +} +#endif + /* br_netlink.c */ extern struct rtnl_link_ops br_link_ops; int br_netlink_init(void); -- cgit v1.2.3 From 419dba8a49d7cc355e5b495d20dea8217369ed63 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:08 +0200 Subject: net: bridge: Add checks for enabling the STP.
It is not possible to have the MRP and STP running at the same time on the bridge, therefore add check when enabling the STP to check if MRP is already enabled. In that case return error. Reviewed-by: Nikolay Aleksandrov Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/br_ioctl.c | 3 +-- net/bridge/br_netlink.c | 4 +++- net/bridge/br_private.h | 3 ++- net/bridge/br_stp.c | 6 ++++++ net/bridge/br_stp_if.c | 11 ++++++++++- net/bridge/br_sysfs_br.c | 4 +--- 6 files changed, 23 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index ae22d784b88a..5e71fc8b826f 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -242,8 +242,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; - br_stp_set_enabled(br, args[1]); - ret = 0; + ret = br_stp_set_enabled(br, args[1], NULL); break; case BRCTL_SET_BRIDGE_PRIORITY: diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 1a5e681a626a..a774e19c41bb 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1109,7 +1109,9 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (data[IFLA_BR_STP_STATE]) { u32 stp_enabled = nla_get_u32(data[IFLA_BR_STP_STATE]); - br_stp_set_enabled(br, stp_enabled); + err = br_stp_set_enabled(br, stp_enabled, extack); + if (err) + return err; } if (data[IFLA_BR_PRIORITY]) { diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 5835828320b6..c35647cb138a 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1283,7 +1283,8 @@ int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time); /* br_stp_if.c */ void br_stp_enable_bridge(struct net_bridge *br); void br_stp_disable_bridge(struct net_bridge *br); -void br_stp_set_enabled(struct net_bridge *br, unsigned long val); +int br_stp_set_enabled(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack); void br_stp_enable_port(struct net_bridge_port *p); void br_stp_disable_port(struct net_bridge_port *p); bool br_stp_recalculate_bridge_id(struct net_bridge *br); diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 1f14b8455345..3e88be7aa269 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -36,6 +36,12 @@ void br_set_state(struct net_bridge_port *p, unsigned int state) }; int err; + /* Don't change the state of the ports if they are driven by a different + * protocol. 
+ */ + if (p->flags & BR_MRP_AWARE) + return; + p->state = state; err = switchdev_port_attr_set(p->dev, &attr); if (err && err != -EOPNOTSUPP) diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index d174d3a566aa..a42850b7eb9a 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -196,10 +196,17 @@ static void br_stp_stop(struct net_bridge *br) br->stp_enabled = BR_NO_STP; } -void br_stp_set_enabled(struct net_bridge *br, unsigned long val) +int br_stp_set_enabled(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { ASSERT_RTNL(); + if (br_mrp_enabled(br)) { + NL_SET_ERR_MSG_MOD(extack, + "STP can't be enabled if MRP is already enabled\n"); + return -EINVAL; + } + if (val) { if (br->stp_enabled == BR_NO_STP) br_stp_start(br); @@ -207,6 +214,8 @@ void br_stp_set_enabled(struct net_bridge *br, unsigned long val) if (br->stp_enabled != BR_NO_STP) br_stp_stop(br); } + + return 0; } /* called under bridge lock */ diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 9ab0f00b1081..7db06e3f642a 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -126,9 +126,7 @@ static ssize_t stp_state_show(struct device *d, static int set_stp_state(struct net_bridge *br, unsigned long val) { - br_stp_set_enabled(br, val); - - return 0; + return br_stp_set_enabled(br, val, NULL); } static ssize_t stp_state_store(struct device *d, -- cgit v1.2.3 From fdb9c405e35bdc6e305b9b4e20ebc141ed14fc81 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 24 Apr 2020 21:55:33 +0200 Subject: netfilter: nf_tables: allow up to 64 bytes in the set element data area So far, the set elements could store up to 128-bits in the data area. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++++ net/netfilter/nf_tables_api.c | 38 ++++++++++++++++++++++++++------------ 2 files changed, 30 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 4ff7c81e6717..d4e29c952c40 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -243,6 +243,10 @@ struct nft_set_elem { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } key_end; + union { + u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; + struct nft_data val; + } data; void *priv; }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9780bd93b7e4..3558e76e2733 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4669,6 +4669,25 @@ static int nft_setelem_parse_key(struct nft_ctx *ctx, struct nft_set *set, return 0; } +static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set, + struct nft_data_desc *desc, + struct nft_data *data, + struct nlattr *attr) +{ + int err; + + err = nft_data_init(ctx, data, NFT_DATA_VALUE_MAXLEN, desc, attr); + if (err < 0) + return err; + + if (desc->type != NFT_DATA_VERDICT && desc->len != set->dlen) { + nft_data_release(data, desc->type); + return -EINVAL; + } + + return 0; +} + static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { @@ -4946,7 +4965,6 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_expr *expr = NULL; struct nft_userdata *udata; struct nft_data_desc desc; - struct nft_data data; enum nft_registers dreg; struct nft_trans *trans; u32 flags = 0; @@ -5072,15 +5090,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } if 
(nla[NFTA_SET_ELEM_DATA] != NULL) { - err = nft_data_init(ctx, &data, sizeof(data), &desc, - nla[NFTA_SET_ELEM_DATA]); + err = nft_setelem_parse_data(ctx, set, &desc, &elem.data.val, + nla[NFTA_SET_ELEM_DATA]); if (err < 0) goto err_parse_key_end; - err = -EINVAL; - if (set->dtype != NFT_DATA_VERDICT && desc.len != set->dlen) - goto err_parse_data; - dreg = nft_type_to_reg(set->dtype); list_for_each_entry(binding, &set->bindings, list) { struct nft_ctx bind_ctx = { @@ -5094,14 +5108,14 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, continue; err = nft_validate_register_store(&bind_ctx, dreg, - &data, + &elem.data.val, desc.type, desc.len); if (err < 0) goto err_parse_data; if (desc.type == NFT_DATA_VERDICT && - (data.verdict.code == NFT_GOTO || - data.verdict.code == NFT_JUMP)) + (elem.data.val.verdict.code == NFT_GOTO || + elem.data.val.verdict.code == NFT_JUMP)) nft_validate_state_update(ctx->net, NFT_VALIDATE_NEED); } @@ -5123,7 +5137,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = -ENOMEM; elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, - elem.key_end.val.data, data.data, + elem.key_end.val.data, elem.data.val.data, timeout, expiration, GFP_KERNEL); if (elem.priv == NULL) goto err_parse_data; @@ -5201,7 +5215,7 @@ err_trans: nf_tables_set_elem_destroy(ctx, set, elem.priv); err_parse_data: if (nla[NFTA_SET_ELEM_DATA] != NULL) - nft_data_release(&data, desc.type); + nft_data_release(&elem.data.val, desc.type); err_parse_key_end: nft_data_release(&elem.key_end.val, NFT_DATA_VALUE); err_parse_key: -- cgit v1.2.3 From 0d7c83463fdf7841350f37960a7abadd3e650b41 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 24 Apr 2020 21:55:34 +0200 Subject: netfilter: nft_nat: return EOPNOTSUPP if type or flags are not supported Instead of EINVAL which should be used for malformed netlink messages. Fixes: eb31628e37a0 ("netfilter: nf_tables: Add support for IPv6 NAT") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_nat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 8b44a4de5329..bb49a217635e 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -129,7 +129,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, priv->type = NF_NAT_MANIP_DST; break; default: - return -EINVAL; + return -EOPNOTSUPP; } if (tb[NFTA_NAT_FAMILY] == NULL) @@ -196,7 +196,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, if (tb[NFTA_NAT_FLAGS]) { priv->flags = ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); if (priv->flags & ~NF_NAT_RANGE_MASK) - return -EINVAL; + return -EOPNOTSUPP; } return nf_ct_netns_get(ctx->net, family); -- cgit v1.2.3 From 4566aa440008103c9bd364f38c03ca5309acc8f4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 24 Apr 2020 21:55:35 +0200 Subject: netfilter: nft_nat: set flags from initialization path This patch sets the NAT flags from the control plane path. 
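As a quick illustration of the control-plane/datapath split this establishes (a sketch only, not code from the patch; the *_sketch names are invented), the flag bits depend solely on which netlink attributes were present at rule creation, so they can be composed once at init time and copied verbatim for every packet:

static void nat_init_sketch(struct nft_nat *priv, bool has_addr, bool has_proto)
{
	/* control plane: runs once, when the rule is created */
	if (has_addr)
		priv->flags |= NF_NAT_RANGE_MAP_IPS;
	if (has_proto)
		priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
}

static void nat_eval_sketch(const struct nft_nat *priv, struct nf_nat_range2 *range)
{
	/* datapath: runs per packet, no flag recomputation */
	range->flags = priv->flags;
}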
Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_nat.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index bb49a217635e..5c7ff213c030 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -55,7 +55,6 @@ static void nft_nat_eval(const struct nft_expr *expr, &regs->data[priv->sreg_addr_max], sizeof(range.max_addr.ip6)); } - range.flags |= NF_NAT_RANGE_MAP_IPS; } if (priv->sreg_proto_min) { @@ -63,10 +62,9 @@ static void nft_nat_eval(const struct nft_expr *expr, &regs->data[priv->sreg_proto_min]); range.max_proto.all = (__force __be16)nft_reg_load16( &regs->data[priv->sreg_proto_max]); - range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } - range.flags |= priv->flags; + range.flags = priv->flags; regs->verdict.code = nf_nat_setup_info(ct, &range, priv->type); } @@ -169,6 +167,8 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, } else { priv->sreg_addr_max = priv->sreg_addr_min; } + + priv->flags |= NF_NAT_RANGE_MAP_IPS; } plen = sizeof_field(struct nf_nat_range, min_addr.all); @@ -191,10 +191,12 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, } else { priv->sreg_proto_max = priv->sreg_proto_min; } + + priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } if (tb[NFTA_NAT_FLAGS]) { - priv->flags = ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); + priv->flags |= ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); if (priv->flags & ~NF_NAT_RANGE_MASK) return -EOPNOTSUPP; } -- cgit v1.2.3 From acd766e31bb96b90c2dc4954f86e573c9ac16c66 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 24 Apr 2020 21:55:36 +0200 Subject: netfilter: nft_nat: add helper function to set up NAT address and protocol This patch adds nft_nat_setup_addr() and nft_nat_setup_proto() to set up the NAT mangling.
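The refactor below is easiest to read through the eval path it produces. The following condensed sketch (paraphrased for this digest, not the literal kernel code) shows the resulting straight-line structure once the register loading has moved into the two helpers:

static void nft_nat_eval_sketch(const struct nft_nat *priv,
				struct nft_regs *regs,
				struct nf_nat_range2 *range)
{
	memset(range, 0, sizeof(*range));

	if (priv->sreg_addr_min)	/* an address range was configured */
		nft_nat_setup_addr(range, regs, priv);

	if (priv->sreg_proto_min)	/* a port range was configured */
		nft_nat_setup_proto(range, regs, priv);

	range->flags = priv->flags;	/* composed at init time, see above */
}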
Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_nat.c | 56 ++++++++++++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 5c7ff213c030..7442aa8b1555 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -30,6 +30,36 @@ struct nft_nat { u16 flags; }; +static void nft_nat_setup_addr(struct nf_nat_range2 *range, + const struct nft_regs *regs, + const struct nft_nat *priv) +{ + switch (priv->family) { + case AF_INET: + range->min_addr.ip = (__force __be32) + regs->data[priv->sreg_addr_min]; + range->max_addr.ip = (__force __be32) + regs->data[priv->sreg_addr_max]; + break; + case AF_INET6: + memcpy(range->min_addr.ip6, &regs->data[priv->sreg_addr_min], + sizeof(range->min_addr.ip6)); + memcpy(range->max_addr.ip6, &regs->data[priv->sreg_addr_max], + sizeof(range->max_addr.ip6)); + break; + } +} + +static void nft_nat_setup_proto(struct nf_nat_range2 *range, + const struct nft_regs *regs, + const struct nft_nat *priv) +{ + range->min_proto.all = (__force __be16) + nft_reg_load16(&regs->data[priv->sreg_proto_min]); + range->max_proto.all = (__force __be16) + nft_reg_load16(&regs->data[priv->sreg_proto_max]); +} + static void nft_nat_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -40,29 +70,11 @@ static void nft_nat_eval(const struct nft_expr *expr, struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); - if (priv->sreg_addr_min) { - if (priv->family == AF_INET) { - range.min_addr.ip = (__force __be32) - regs->data[priv->sreg_addr_min]; - range.max_addr.ip = (__force __be32) - regs->data[priv->sreg_addr_max]; - - } else { - memcpy(range.min_addr.ip6, - &regs->data[priv->sreg_addr_min], - sizeof(range.min_addr.ip6)); - memcpy(range.max_addr.ip6, - &regs->data[priv->sreg_addr_max], - sizeof(range.max_addr.ip6)); - } - } + if (priv->sreg_addr_min) + nft_nat_setup_addr(&range, regs, priv); - if (priv->sreg_proto_min) { - range.min_proto.all = (__force __be16)nft_reg_load16( - &regs->data[priv->sreg_proto_min]); - range.max_proto.all = (__force __be16)nft_reg_load16( - &regs->data[priv->sreg_proto_max]); - } + if (priv->sreg_proto_min) + nft_nat_setup_proto(&range, regs, priv); range.flags = priv->flags; -- cgit v1.2.3 From 3ff7ddb1353da9b535e65702704cbadea1da9a00 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 24 Apr 2020 21:55:37 +0200 Subject: netfilter: nft_nat: add netmap support This patch allows you to NAT the network address prefix onto another network address prefix, a.k.a. netmapping. Userspace must specify the NF_NAT_RANGE_NETMAP flag and the prefix address through the NFTA_NAT_REG_ADDR_MIN and NFTA_NAT_REG_ADDR_MAX netlink attributes.
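A small standalone program (illustrative addresses from the RFC 5737 documentation ranges) shows the arithmetic the netmap rewrite performs: the bits on which the configured min and max addresses agree become the new prefix, and the remaining host bits are copied from the packet address:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t orig = 0xc0000220;	/* 192.0.2.32, address taken from the packet */
	uint32_t min  = 0xc6336400;	/* 198.51.100.0, configured range minimum */
	uint32_t max  = 0xc63364ff;	/* 198.51.100.255, configured range maximum */

	/* Bits where min and max agree form the replacement prefix */
	uint32_t netmask = ~(min ^ max);		/* 0xffffff00 */
	uint32_t mapped  = (orig & ~netmask)		/* keep the host bits */
			 | (min & netmask);		/* swap in the prefix */

	printf("mapped = 0x%08x\n", mapped);	/* 0xc6336420, i.e. 198.51.100.32 */
	return 0;
}

The kernel code in the diff below does the same computation per 32-bit word, which lets one loop cover both IPv4 and IPv6 prefixes.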
Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_nat.h | 4 ++- net/netfilter/nft_nat.c | 46 ++++++++++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_nat.h b/include/uapi/linux/netfilter/nf_nat.h index 4a95c0db14d4..a64586e77b24 100644 --- a/include/uapi/linux/netfilter/nf_nat.h +++ b/include/uapi/linux/netfilter/nf_nat.h @@ -11,6 +11,7 @@ #define NF_NAT_RANGE_PERSISTENT (1 << 3) #define NF_NAT_RANGE_PROTO_RANDOM_FULLY (1 << 4) #define NF_NAT_RANGE_PROTO_OFFSET (1 << 5) +#define NF_NAT_RANGE_NETMAP (1 << 6) #define NF_NAT_RANGE_PROTO_RANDOM_ALL \ (NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY) @@ -18,7 +19,8 @@ #define NF_NAT_RANGE_MASK \ (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED | \ NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PERSISTENT | \ - NF_NAT_RANGE_PROTO_RANDOM_FULLY | NF_NAT_RANGE_PROTO_OFFSET) + NF_NAT_RANGE_PROTO_RANDOM_FULLY | NF_NAT_RANGE_PROTO_OFFSET | \ + NF_NAT_RANGE_NETMAP) struct nf_nat_ipv4_range { unsigned int flags; diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 7442aa8b1555..23a7bfd10521 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -60,6 +60,46 @@ static void nft_nat_setup_proto(struct nf_nat_range2 *range, nft_reg_load16(®s->data[priv->sreg_proto_max]); } +static void nft_nat_setup_netmap(struct nf_nat_range2 *range, + const struct nft_pktinfo *pkt, + const struct nft_nat *priv) +{ + struct sk_buff *skb = pkt->skb; + union nf_inet_addr new_addr; + __be32 netmask; + int i, len = 0; + + switch (priv->type) { + case NFT_NAT_SNAT: + if (nft_pf(pkt) == NFPROTO_IPV4) { + new_addr.ip = ip_hdr(skb)->saddr; + len = sizeof(struct in_addr); + } else { + new_addr.in6 = ipv6_hdr(skb)->saddr; + len = sizeof(struct in6_addr); + } + break; + case NFT_NAT_DNAT: + if (nft_pf(pkt) == NFPROTO_IPV4) { + new_addr.ip = ip_hdr(skb)->daddr; + len = sizeof(struct in_addr); + } else { + new_addr.in6 = ipv6_hdr(skb)->daddr; + len = sizeof(struct in6_addr); + } + break; + } + + for (i = 0; i < len / sizeof(__be32); i++) { + netmask = ~(range->min_addr.ip6[i] ^ range->max_addr.ip6[i]); + new_addr.ip6[i] &= ~netmask; + new_addr.ip6[i] |= range->min_addr.ip6[i] & netmask; + } + + range->min_addr = new_addr; + range->max_addr = new_addr; +} + static void nft_nat_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -70,8 +110,12 @@ static void nft_nat_eval(const struct nft_expr *expr, struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); - if (priv->sreg_addr_min) + + if (priv->sreg_addr_min) { nft_nat_setup_addr(&range, regs, priv); + if (priv->flags & NF_NAT_RANGE_NETMAP) + nft_nat_setup_netmap(&range, pkt, priv); + } if (priv->sreg_proto_min) nft_nat_setup_proto(&range, regs, priv); -- cgit v1.2.3 From 0146dca70b877b73c5fd9c67912b8a0ca8a7bac7 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 27 Apr 2020 17:59:34 +0200 Subject: xfrm: add support for UDPv6 encapsulation of ESP This patch adds support for encapsulation of ESP over UDPv6. The code is very similar to the IPv4 encapsulation implementation, and allows to easily add espintcp on IPv6 as a follow-up. 
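From userspace the IPv6 case is driven exactly like the IPv4 one: the IKE daemon marks its NAT-traversal socket for ESP-in-UDP decapsulation with the UDP_ENCAP socket option, now on an AF_INET6 socket. A minimal sketch (error handling omitted; the function name and port choice are illustrative):

#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/udp.h>	/* UDP_ENCAP, UDP_ENCAP_ESPINUDP */

int natt_socket_v6(void)
{
	int fd = socket(AF_INET6, SOCK_DGRAM, 0);
	int encap = UDP_ENCAP_ESPINUDP;
	struct sockaddr_in6 sin6;

	memset(&sin6, 0, sizeof(sin6));
	sin6.sin6_family = AF_INET6;
	sin6.sin6_port = htons(4500);	/* customary IKE NAT-T port */
	bind(fd, (struct sockaddr *)&sin6, sizeof(sin6));

	/* With this patch, the kernel installs xfrm6_udp_encap_rcv
	 * (via the ipv6 stub) instead of the IPv4 handler when the
	 * socket family is AF_INET6.
	 */
	setsockopt(fd, IPPROTO_UDP, UDP_ENCAP, &encap, sizeof(encap));
	return fd;
}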
Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/ipv6_stubs.h | 3 + include/net/xfrm.h | 5 + net/ipv4/udp.c | 10 +- net/ipv6/af_inet6.c | 4 + net/ipv6/ah6.c | 1 + net/ipv6/esp6.c | 226 +++++++++++++++++++++++++++++++++++++++++----- net/ipv6/esp6_offload.c | 7 +- net/ipv6/ip6_vti.c | 18 +++- net/ipv6/ipcomp6.c | 1 + net/ipv6/xfrm6_input.c | 106 +++++++++++++++++++++- net/ipv6/xfrm6_protocol.c | 48 ++++++++++ net/xfrm/xfrm_interface.c | 3 + 12 files changed, 395 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index 3e7d2c0e79ca..f033a17b53b6 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -56,6 +56,9 @@ struct ipv6_stub { void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr, const struct in6_addr *solicited_addr, bool router, bool solicited, bool override, bool inc_opt); +#if IS_ENABLED(CONFIG_XFRM) + int (*xfrm6_udp_encap_rcv)(struct sock *sk, struct sk_buff *skb); +#endif struct neigh_table *nd_tbl; }; extern const struct ipv6_stub *ipv6_stub __read_mostly; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 8f71c111e65a..2577666c34c8 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1406,6 +1406,8 @@ struct xfrm4_protocol { struct xfrm6_protocol { int (*handler)(struct sk_buff *skb); + int (*input_handler)(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type); int (*cb_handler)(struct sk_buff *skb, int err); int (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info); @@ -1590,6 +1592,8 @@ int xfrm6_extract_header(struct sk_buff *skb); int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb); int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, struct ip6_tnl *t); +int xfrm6_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type); int xfrm6_transport_finish(struct sk_buff *skb, int async); int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t); int xfrm6_rcv(struct sk_buff *skb); @@ -1610,6 +1614,7 @@ int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, #ifdef CONFIG_XFRM int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb); +int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb); int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen); #else diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 32564b350823..1b7ebbcae497 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -112,6 +112,9 @@ #include #include #include +#if IS_ENABLED(CONFIG_IPV6) +#include +#endif struct udp_table udp_table __read_mostly; EXPORT_SYMBOL(udp_table); @@ -2563,7 +2566,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, #ifdef CONFIG_XFRM case UDP_ENCAP_ESPINUDP: case UDP_ENCAP_ESPINUDP_NON_IKE: - up->encap_rcv = xfrm4_udp_encap_rcv; +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + up->encap_rcv = ipv6_stub->xfrm6_udp_encap_rcv; + else +#endif + up->encap_rcv = xfrm4_udp_encap_rcv; #endif fallthrough; case UDP_ENCAP_L2TPINUDP: diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 345baa0a754f..b0b99c08350a 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include @@ -961,6 +962,9 @@ static const struct ipv6_stub ipv6_stub_impl = { .ip6_del_rt = ip6_del_rt, .udpv6_encap_enable = udpv6_encap_enable, .ndisc_send_na = ndisc_send_na, +#if IS_ENABLED(CONFIG_XFRM) + .xfrm6_udp_encap_rcv = 
xfrm6_udp_encap_rcv, +#endif .nd_tbl = &nd_tbl, }; diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 45e2adc56610..d88d97617f7e 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -767,6 +767,7 @@ static const struct xfrm_type ah6_type = { static struct xfrm6_protocol ah6_protocol = { .handler = xfrm6_rcv, + .input_handler = xfrm_input, .cb_handler = ah6_rcv_cb, .err_handler = ah6_err, .priority = 0, diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 11143d039f16..e8800968e209 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -26,10 +26,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include @@ -39,6 +41,11 @@ struct esp_skb_cb { void *tmp; }; +struct esp_output_extra { + __be32 seqhi; + u32 esphoff; +}; + #define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0])) /* @@ -72,9 +79,9 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqihlen) return kmalloc(len, GFP_ATOMIC); } -static inline __be32 *esp_tmp_seqhi(void *tmp) +static inline void *esp_tmp_extra(void *tmp) { - return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32)); + return PTR_ALIGN(tmp, __alignof__(struct esp_output_extra)); } static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen) @@ -104,16 +111,17 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, static void esp_ssg_unref(struct xfrm_state *x, void *tmp) { + struct esp_output_extra *extra = esp_tmp_extra(tmp); struct crypto_aead *aead = x->data; - int seqhilen = 0; + int extralen = 0; u8 *iv; struct aead_request *req; struct scatterlist *sg; if (x->props.flags & XFRM_STATE_ESN) - seqhilen += sizeof(__be32); + extralen += sizeof(*extra); - iv = esp_tmp_iv(aead, tmp, seqhilen); + iv = esp_tmp_iv(aead, tmp, extralen); req = esp_tmp_req(aead, iv); /* Unref skb_frag_pages in the src scatterlist if necessary. 
@@ -124,6 +132,23 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) put_page(sg_page(sg)); } +static void esp_output_encap_csum(struct sk_buff *skb) +{ + /* UDP encap with IPv6 requires a valid checksum */ + if (*skb_mac_header(skb) == IPPROTO_UDP) { + struct udphdr *uh = udp_hdr(skb); + struct ipv6hdr *ip6h = ipv6_hdr(skb); + int len = ntohs(uh->len); + unsigned int offset = skb_transport_offset(skb); + __wsum csum = skb_checksum(skb, offset, skb->len - offset, 0); + + uh->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + len, IPPROTO_UDP, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } +} + static void esp_output_done(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; @@ -143,6 +168,8 @@ static void esp_output_done(struct crypto_async_request *base, int err) esp_ssg_unref(x, tmp); kfree(tmp); + esp_output_encap_csum(skb); + if (xo && (xo->flags & XFRM_DEV_RESUME)) { if (err) { XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR); @@ -163,7 +190,7 @@ static void esp_restore_header(struct sk_buff *skb, unsigned int offset) { struct ip_esp_hdr *esph = (void *)(skb->data + offset); void *tmp = ESP_SKB_CB(skb)->tmp; - __be32 *seqhi = esp_tmp_seqhi(tmp); + __be32 *seqhi = esp_tmp_extra(tmp); esph->seq_no = esph->spi; esph->spi = *seqhi; @@ -171,27 +198,36 @@ static void esp_restore_header(struct sk_buff *skb, unsigned int offset) static void esp_output_restore_header(struct sk_buff *skb) { - esp_restore_header(skb, skb_transport_offset(skb) - sizeof(__be32)); + void *tmp = ESP_SKB_CB(skb)->tmp; + struct esp_output_extra *extra = esp_tmp_extra(tmp); + + esp_restore_header(skb, skb_transport_offset(skb) + extra->esphoff - + sizeof(__be32)); } static struct ip_esp_hdr *esp_output_set_esn(struct sk_buff *skb, struct xfrm_state *x, struct ip_esp_hdr *esph, - __be32 *seqhi) + struct esp_output_extra *extra) { /* For ESN we move the header forward by 4 bytes to * accomodate the high bits. We will move it back after * encryption. 
*/ if ((x->props.flags & XFRM_STATE_ESN)) { + __u32 seqhi; struct xfrm_offload *xo = xfrm_offload(skb); - esph = (void *)(skb_transport_header(skb) - sizeof(__be32)); - *seqhi = esph->spi; if (xo) - esph->seq_no = htonl(xo->seq.hi); + seqhi = xo->seq.hi; else - esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + seqhi = XFRM_SKB_CB(skb)->seq.output.hi; + + extra->esphoff = (unsigned char *)esph - + skb_transport_header(skb); + esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4); + extra->seqhi = esph->spi; + esph->seq_no = htonl(seqhi); } esph->spi = x->id.spi; @@ -207,15 +243,84 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err) esp_output_done(base, err); } +static struct ip_esp_hdr *esp6_output_udp_encap(struct sk_buff *skb, + int encap_type, + struct esp_info *esp, + __be16 sport, + __be16 dport) +{ + struct udphdr *uh; + __be32 *udpdata32; + unsigned int len; + + len = skb->len + esp->tailen - skb_transport_offset(skb); + if (len > U16_MAX) + return ERR_PTR(-EMSGSIZE); + + uh = (struct udphdr *)esp->esph; + uh->source = sport; + uh->dest = dport; + uh->len = htons(len); + uh->check = 0; + + *skb_mac_header(skb) = IPPROTO_UDP; + + if (encap_type == UDP_ENCAP_ESPINUDP_NON_IKE) { + udpdata32 = (__be32 *)(uh + 1); + udpdata32[0] = udpdata32[1] = 0; + return (struct ip_esp_hdr *)(udpdata32 + 2); + } + + return (struct ip_esp_hdr *)(uh + 1); +} + +static int esp6_output_encap(struct xfrm_state *x, struct sk_buff *skb, + struct esp_info *esp) +{ + struct xfrm_encap_tmpl *encap = x->encap; + struct ip_esp_hdr *esph; + __be16 sport, dport; + int encap_type; + + spin_lock_bh(&x->lock); + sport = encap->encap_sport; + dport = encap->encap_dport; + encap_type = encap->encap_type; + spin_unlock_bh(&x->lock); + + switch (encap_type) { + default: + case UDP_ENCAP_ESPINUDP: + case UDP_ENCAP_ESPINUDP_NON_IKE: + esph = esp6_output_udp_encap(skb, encap_type, esp, sport, dport); + break; + } + + if (IS_ERR(esph)) + return PTR_ERR(esph); + + esp->esph = esph; + + return 0; +} + int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) { u8 *tail; u8 *vaddr; int nfrags; + int esph_offset; struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; + if (x->encap) { + int err = esp6_output_encap(x, skb, esp); + + if (err < 0) + return err; + } + if (!skb_cloned(skb)) { if (tailen <= skb_tailroom(skb)) { nfrags = 1; @@ -274,10 +379,13 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info } cow: + esph_offset = (unsigned char *)esp->esph - skb_transport_header(skb); + nfrags = skb_cow_data(skb, tailen, &trailer); if (nfrags < 0) goto out; tail = skb_tail_pointer(trailer); + esp->esph = (struct ip_esp_hdr *)(skb_transport_header(skb) + esph_offset); skip_cow: esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto); @@ -295,20 +403,20 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info void *tmp; int ivlen; int assoclen; - int seqhilen; - __be32 *seqhi; + int extralen; struct page *page; struct ip_esp_hdr *esph; struct aead_request *req; struct crypto_aead *aead; struct scatterlist *sg, *dsg; + struct esp_output_extra *extra; int err = -ENOMEM; assoclen = sizeof(struct ip_esp_hdr); - seqhilen = 0; + extralen = 0; if (x->props.flags & XFRM_STATE_ESN) { - seqhilen += sizeof(__be32); + extralen += sizeof(*extra); assoclen += sizeof(__be32); } @@ -316,12 +424,12 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info alen = 
crypto_aead_authsize(aead); ivlen = crypto_aead_ivsize(aead); - tmp = esp_alloc_tmp(aead, esp->nfrags + 2, seqhilen); + tmp = esp_alloc_tmp(aead, esp->nfrags + 2, extralen); if (!tmp) goto error; - seqhi = esp_tmp_seqhi(tmp); - iv = esp_tmp_iv(aead, tmp, seqhilen); + extra = esp_tmp_extra(tmp); + iv = esp_tmp_iv(aead, tmp, extralen); req = esp_tmp_req(aead, iv); sg = esp_req_sg(aead, req); @@ -330,7 +438,8 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info else dsg = &sg[esp->nfrags]; - esph = esp_output_set_esn(skb, x, ip_esp_hdr(skb), seqhi); + esph = esp_output_set_esn(skb, x, esp->esph, extra); + esp->esph = esph; sg_init_table(sg, esp->nfrags); err = skb_to_sgvec(skb, sg, @@ -394,6 +503,7 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info case 0: if ((x->props.flags & XFRM_STATE_ESN)) esp_output_restore_header(skb); + esp_output_encap_csum(skb); } if (sg != dsg) @@ -438,11 +548,13 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) esp.plen = esp.clen - skb->len - esp.tfclen; esp.tailen = esp.tfclen + esp.plen + alen; + esp.esph = ip_esp_hdr(skb); + esp.nfrags = esp6_output_head(x, skb, &esp); if (esp.nfrags < 0) return esp.nfrags; - esph = ip_esp_hdr(skb); + esph = esp.esph; esph->spi = x->id.spi; esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); @@ -517,6 +629,56 @@ int esp6_input_done2(struct sk_buff *skb, int err) if (unlikely(err < 0)) goto out; + if (x->encap) { + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct xfrm_encap_tmpl *encap = x->encap; + struct udphdr *uh = (void *)(skb_network_header(skb) + hdr_len); + __be16 source; + + switch (x->encap->encap_type) { + case UDP_ENCAP_ESPINUDP: + case UDP_ENCAP_ESPINUDP_NON_IKE: + source = uh->source; + break; + default: + WARN_ON_ONCE(1); + err = -EINVAL; + goto out; + } + + /* + * 1) if the NAT-T peer's IP or port changed then + * advertize the change to the keying daemon. + * This is an inbound SA, so just compare + * SRC ports. + */ + if (!ipv6_addr_equal(&ip6h->saddr, &x->props.saddr.in6) || + source != encap->encap_sport) { + xfrm_address_t ipaddr; + + memcpy(&ipaddr.a6, &ip6h->saddr.s6_addr, sizeof(ipaddr.a6)); + km_new_mapping(x, &ipaddr, source); + + /* XXX: perhaps add an extra + * policy check here, to see + * if we should allow or + * reject a packet from a + * different source + * address/port. 
+ */ + } + + /* + * 2) ignore UDP/TCP checksums in case + * of NAT-T in Transport Mode, or + * perform other post-processing fixes + * as per draft-ietf-ipsec-udp-encaps-06, + * section 3.1.2 + */ + if (x->props.mode == XFRM_MODE_TRANSPORT) + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); skb_pull_rcsum(skb, hlen); @@ -632,7 +794,7 @@ skip_cow: goto out; ESP_SKB_CB(skb)->tmp = tmp; - seqhi = esp_tmp_seqhi(tmp); + seqhi = esp_tmp_extra(tmp); iv = esp_tmp_iv(aead, tmp, seqhilen); req = esp_tmp_req(aead, iv); sg = esp_req_sg(aead, req); @@ -836,9 +998,6 @@ static int esp6_init_state(struct xfrm_state *x) u32 align; int err; - if (x->encap) - return -EINVAL; - x->data = NULL; if (x->aead) @@ -867,6 +1026,22 @@ static int esp6_init_state(struct xfrm_state *x) break; } + if (x->encap) { + struct xfrm_encap_tmpl *encap = x->encap; + + switch (encap->encap_type) { + default: + err = -EINVAL; + goto error; + case UDP_ENCAP_ESPINUDP: + x->props.header_len += sizeof(struct udphdr); + break; + case UDP_ENCAP_ESPINUDP_NON_IKE: + x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32); + break; + } + } + align = ALIGN(crypto_aead_blocksize(aead), 4); x->props.trailer_len = align + 1 + crypto_aead_authsize(aead); @@ -893,6 +1068,7 @@ static const struct xfrm_type esp6_type = { static struct xfrm6_protocol esp6_protocol = { .handler = xfrm6_rcv, + .input_handler = xfrm_input, .cb_handler = esp6_rcv_cb, .err_handler = esp6_err, .priority = 0, diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 8eab2c869d61..06163cc15844 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -271,7 +271,6 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features int alen; int blksize; struct xfrm_offload *xo; - struct ip_esp_hdr *esph; struct crypto_aead *aead; struct esp_info esp; bool hw_offload = true; @@ -312,13 +311,13 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features seq = xo->seq.low; - esph = ip_esp_hdr(skb); - esph->spi = x->id.spi; + esp.esph = ip_esp_hdr(skb); + esp.esph->spi = x->id.spi; skb_push(skb, -skb_network_offset(skb)); if (xo->flags & XFRM_GSO_SEGMENT) { - esph->seq_no = htonl(seq); + esp.esph->seq_no = htonl(seq); if (!skb_is_gso(skb)) xo->seq.low++; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index cc6180e08a4f..1147f647b9a0 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -296,7 +296,8 @@ static void vti6_dev_uninit(struct net_device *dev) dev_put(dev); } -static int vti6_rcv(struct sk_buff *skb) +static int vti6_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type) { struct ip6_tnl *t; const struct ipv6hdr *ipv6h = ipv6_hdr(skb); @@ -323,7 +324,10 @@ static int vti6_rcv(struct sk_buff *skb) rcu_read_unlock(); - return xfrm6_rcv_tnl(skb, t); + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; + XFRM_SPI_SKB_CB(skb)->family = AF_INET6; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); + return xfrm_input(skb, nexthdr, spi, encap_type); } rcu_read_unlock(); return -EINVAL; @@ -332,6 +336,13 @@ discard: return 0; } +static int vti6_rcv(struct sk_buff *skb) +{ + int nexthdr = skb_network_header(skb)[IP6CB(skb)->nhoff]; + + return vti6_input_proto(skb, nexthdr, 0, 0); +} + static int vti6_rcv_cb(struct sk_buff *skb, int err) { unsigned short family; @@ -1185,6 +1196,7 @@ static struct pernet_operations vti6_net_ops = { static struct xfrm6_protocol vti_esp6_protocol __read_mostly = { 
.handler = vti6_rcv, + .input_handler = vti6_input_proto, .cb_handler = vti6_rcv_cb, .err_handler = vti6_err, .priority = 100, @@ -1192,6 +1204,7 @@ static struct xfrm6_protocol vti_esp6_protocol __read_mostly = { static struct xfrm6_protocol vti_ah6_protocol __read_mostly = { .handler = vti6_rcv, + .input_handler = vti6_input_proto, .cb_handler = vti6_rcv_cb, .err_handler = vti6_err, .priority = 100, @@ -1199,6 +1212,7 @@ static struct xfrm6_protocol vti_ah6_protocol __read_mostly = { static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = { .handler = vti6_rcv, + .input_handler = vti6_input_proto, .cb_handler = vti6_rcv_cb, .err_handler = vti6_err, .priority = 100, diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 3752bd3e92ce..99668bfebd85 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -183,6 +183,7 @@ static const struct xfrm_type ipcomp6_type = { static struct xfrm6_protocol ipcomp6_protocol = { .handler = xfrm6_rcv, + .input_handler = xfrm_input, .cb_handler = ipcomp6_rcv_cb, .err_handler = ipcomp6_err, .priority = 0, diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index a52cb3fc6df5..56f52353b324 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -35,9 +35,12 @@ EXPORT_SYMBOL(xfrm6_rcv_spi); static int xfrm6_transport_finish2(struct net *net, struct sock *sk, struct sk_buff *skb) { - if (xfrm_trans_queue(skb, ip6_rcv_finish)) - __kfree_skb(skb); - return -1; + if (xfrm_trans_queue(skb, ip6_rcv_finish)) { + kfree_skb(skb); + return NET_RX_DROP; + } + + return 0; } int xfrm6_transport_finish(struct sk_buff *skb, int async) @@ -60,13 +63,106 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) if (xo && (xo->flags & XFRM_GRO)) { skb_mac_header_rebuild(skb); skb_reset_transport_header(skb); - return -1; + return 0; } NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, dev_net(skb->dev), NULL, skb, skb->dev, NULL, xfrm6_transport_finish2); - return -1; + return 0; +} + +/* If it's a keepalive packet, then just eat it. + * If it's an encapsulated packet, then pass it to the + * IPsec xfrm input. + * Returns 0 if skb passed to xfrm or was dropped. + * Returns >0 if skb should be passed to UDP. + * Returns <0 if skb should be resubmitted (-ret is protocol) + */ +int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct udp_sock *up = udp_sk(sk); + struct udphdr *uh; + struct ipv6hdr *ip6h; + int len; + int ip6hlen = sizeof(struct ipv6hdr); + + __u8 *udpdata; + __be32 *udpdata32; + __u16 encap_type = up->encap_type; + + /* if this is not encapsulated socket, then just return now */ + if (!encap_type) + return 1; + + /* If this is a paged skb, make sure we pull up + * whatever data we need to look at. */ + len = skb->len - sizeof(struct udphdr); + if (!pskb_may_pull(skb, sizeof(struct udphdr) + min(len, 8))) + return 1; + + /* Now we can get the pointers */ + uh = udp_hdr(skb); + udpdata = (__u8 *)uh + sizeof(struct udphdr); + udpdata32 = (__be32 *)udpdata; + + switch (encap_type) { + default: + case UDP_ENCAP_ESPINUDP: + /* Check if this is a keepalive packet. If so, eat it. */ + if (len == 1 && udpdata[0] == 0xff) { + goto drop; + } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0) { + /* ESP Packet without Non-ESP header */ + len = sizeof(struct udphdr); + } else + /* Must be an IKE packet.. pass it through */ + return 1; + break; + case UDP_ENCAP_ESPINUDP_NON_IKE: + /* Check if this is a keepalive packet. If so, eat it. 
*/ + if (len == 1 && udpdata[0] == 0xff) { + goto drop; + } else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) && + udpdata32[0] == 0 && udpdata32[1] == 0) { + + /* ESP Packet with Non-IKE marker */ + len = sizeof(struct udphdr) + 2 * sizeof(u32); + } else + /* Must be an IKE packet.. pass it through */ + return 1; + break; + } + + /* At this point we are sure that this is an ESPinUDP packet, + * so we need to remove 'len' bytes from the packet (the UDP + * header and optional ESP marker bytes) and then modify the + * protocol to ESP, and then call into the transform receiver. + */ + if (skb_unclone(skb, GFP_ATOMIC)) + goto drop; + + /* Now we can update and verify the packet length... */ + ip6h = ipv6_hdr(skb); + ip6h->payload_len = htons(ntohs(ip6h->payload_len) - len); + if (skb->len < ip6hlen + len) { + /* packet is too small!?! */ + goto drop; + } + + /* pull the data buffer up to the ESP header and set the + * transport header to point to ESP. Keep UDP on the stack + * for later. + */ + __skb_pull(skb, len); + skb_reset_transport_header(skb); + + /* process ESP */ + return xfrm6_rcv_encap(skb, IPPROTO_ESP, 0, encap_type); + +drop: + kfree_skb(skb); + return 0; } int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t) diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c index 34cb65c7d5a7..ea2f805d3b01 100644 --- a/net/ipv6/xfrm6_protocol.c +++ b/net/ipv6/xfrm6_protocol.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,53 @@ static int xfrm6_rcv_cb(struct sk_buff *skb, u8 protocol, int err) return 0; } +int xfrm6_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type) +{ + int ret; + struct xfrm6_protocol *handler; + struct xfrm6_protocol __rcu **head = proto_handlers(nexthdr); + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + XFRM_SPI_SKB_CB(skb)->family = AF_INET6; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); + + if (!head) + goto out; + + if (!skb_dst(skb)) { + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + int flags = RT6_LOOKUP_F_HAS_SADDR; + struct dst_entry *dst; + struct flowi6 fl6 = { + .flowi6_iif = skb->dev->ifindex, + .daddr = ip6h->daddr, + .saddr = ip6h->saddr, + .flowlabel = ip6_flowinfo(ip6h), + .flowi6_mark = skb->mark, + .flowi6_proto = ip6h->nexthdr, + }; + + dst = ip6_route_input_lookup(dev_net(skb->dev), skb->dev, &fl6, + skb, flags); + if (dst->error) + goto drop; + skb_dst_set(skb, dst); + } + + for_each_protocol_rcu(*head, handler) + if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL) + return ret; + +out: + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + +drop: + kfree_skb(skb); + return 0; +} +EXPORT_SYMBOL(xfrm6_rcv_encap); + static int xfrm6_esp_rcv(struct sk_buff *skb) { int ret; diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index eb9928c0a87c..02f8f46d0cc5 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -755,6 +755,7 @@ static struct pernet_operations xfrmi_net_ops = { static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { .handler = xfrm6_rcv, + .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, @@ -762,6 +763,7 @@ static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { static struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = { .handler = xfrm6_rcv, + .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, @@ -769,6 +771,7 @@ static 
struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = { static struct xfrm6_protocol xfrmi_ipcomp6_protocol __read_mostly = { .handler = xfrm6_rcv, + .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, -- cgit v1.2.3 From 26333c37fc285e7372f1b9461f3ae0ba3dc699c9 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 27 Apr 2020 17:59:35 +0200 Subject: xfrm: add IPv6 support for espintcp This extends espintcp to support IPv6, building on the existing code and the new UDPv6 encapsulation support. Most of the code is either reused directly (stream parser, ULP) or very similar to the IPv4 variant (net/ipv6/esp6.c changes). The separation of config options for IPv4 and IPv6 espintcp requires a bit of Kconfig gymnastics to enable the core code. Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/ipv6_stubs.h | 2 + net/ipv4/Kconfig | 1 + net/ipv6/Kconfig | 12 +++ net/ipv6/af_inet6.c | 1 + net/ipv6/esp6.c | 188 ++++++++++++++++++++++++++++++++++++++++++++++- net/xfrm/Kconfig | 3 + net/xfrm/Makefile | 2 +- net/xfrm/espintcp.c | 56 +++++++++++--- 8 files changed, 252 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index f033a17b53b6..1e9e0cf7dc75 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -58,6 +58,8 @@ struct ipv6_stub { bool router, bool solicited, bool override, bool inc_opt); #if IS_ENABLED(CONFIG_XFRM) int (*xfrm6_udp_encap_rcv)(struct sock *sk, struct sk_buff *skb); + int (*xfrm6_rcv_encap)(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type); #endif struct neigh_table *nd_tbl; }; diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 25a8888826b8..014aaa17dc79 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -384,6 +384,7 @@ config INET_ESPINTCP depends on XFRM && INET_ESP select STREAM_PARSER select NET_SOCK_MSG + select XFRM_ESPINTCP help Support for RFC 8229 encapsulation of ESP and IKE over TCP/IPv4 sockets. diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 2ccaee98fddb..468a2faadc7d 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -88,6 +88,18 @@ config INET6_ESP_OFFLOAD If unsure, say N. +config INET6_ESPINTCP + bool "IPv6: ESP in TCP encapsulation (RFC 8229)" + depends on XFRM && INET6_ESP + select STREAM_PARSER + select NET_SOCK_MSG + select XFRM_ESPINTCP + help + Support for RFC 8229 encapsulation of ESP and IKE over + TCP/IPv6 sockets. + + If unsure, say N. 
+ config INET6_IPCOMP tristate "IPv6: IPComp transformation" select INET6_XFRM_TUNNEL diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b0b99c08350a..cbbb00bad20e 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -964,6 +964,7 @@ static const struct ipv6_stub ipv6_stub_impl = { .ndisc_send_na = ndisc_send_na, #if IS_ENABLED(CONFIG_XFRM) .xfrm6_udp_encap_rcv = xfrm6_udp_encap_rcv, + .xfrm6_rcv_encap = xfrm6_rcv_encap, #endif .nd_tbl = &nd_tbl, }; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index e8800968e209..c43592771126 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -33,6 +33,9 @@ #include #include #include +#include +#include +#include #include @@ -132,6 +135,132 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) put_page(sg_page(sg)); } +#ifdef CONFIG_INET6_ESPINTCP +struct esp_tcp_sk { + struct sock *sk; + struct rcu_head rcu; +}; + +static void esp_free_tcp_sk(struct rcu_head *head) +{ + struct esp_tcp_sk *esk = container_of(head, struct esp_tcp_sk, rcu); + + sock_put(esk->sk); + kfree(esk); +} + +static struct sock *esp6_find_tcp_sk(struct xfrm_state *x) +{ + struct xfrm_encap_tmpl *encap = x->encap; + struct esp_tcp_sk *esk; + __be16 sport, dport; + struct sock *nsk; + struct sock *sk; + + sk = rcu_dereference(x->encap_sk); + if (sk && sk->sk_state == TCP_ESTABLISHED) + return sk; + + spin_lock_bh(&x->lock); + sport = encap->encap_sport; + dport = encap->encap_dport; + nsk = rcu_dereference_protected(x->encap_sk, + lockdep_is_held(&x->lock)); + if (sk && sk == nsk) { + esk = kmalloc(sizeof(*esk), GFP_ATOMIC); + if (!esk) { + spin_unlock_bh(&x->lock); + return ERR_PTR(-ENOMEM); + } + RCU_INIT_POINTER(x->encap_sk, NULL); + esk->sk = sk; + call_rcu(&esk->rcu, esp_free_tcp_sk); + } + spin_unlock_bh(&x->lock); + + sk = __inet6_lookup_established(xs_net(x), &tcp_hashinfo, &x->id.daddr.in6, + dport, &x->props.saddr.in6, ntohs(sport), 0, 0); + if (!sk) + return ERR_PTR(-ENOENT); + + if (!tcp_is_ulp_esp(sk)) { + sock_put(sk); + return ERR_PTR(-EINVAL); + } + + spin_lock_bh(&x->lock); + nsk = rcu_dereference_protected(x->encap_sk, + lockdep_is_held(&x->lock)); + if (encap->encap_sport != sport || + encap->encap_dport != dport) { + sock_put(sk); + sk = nsk ?: ERR_PTR(-EREMCHG); + } else if (sk == nsk) { + sock_put(sk); + } else { + rcu_assign_pointer(x->encap_sk, sk); + } + spin_unlock_bh(&x->lock); + + return sk; +} + +static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb) +{ + struct sock *sk; + int err; + + rcu_read_lock(); + + sk = esp6_find_tcp_sk(x); + err = PTR_ERR_OR_ZERO(sk); + if (err) + goto out; + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) + err = espintcp_queue_out(sk, skb); + else + err = espintcp_push_skb(sk, skb); + bh_unlock_sock(sk); + +out: + rcu_read_unlock(); + return err; +} + +static int esp_output_tcp_encap_cb(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; + + return esp_output_tcp_finish(x, skb); +} + +static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + + local_bh_disable(); + err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb); + local_bh_enable(); + + /* EINPROGRESS just happens to do the right thing. It + * actually means that the skb has been consumed and + * isn't coming back. 
+ */ + return err ?: -EINPROGRESS; +} +#else +static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb) +{ + kfree_skb(skb); + + return -EOPNOTSUPP; +} +#endif + static void esp_output_encap_csum(struct sk_buff *skb) { /* UDP encap with IPv6 requires a valid checksum */ @@ -181,7 +310,11 @@ static void esp_output_done(struct crypto_async_request *base, int err) secpath_reset(skb); xfrm_dev_resume(skb); } else { - xfrm_output_resume(skb, err); + if (!err && + x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) + esp_output_tail_tcp(x, skb); + else + xfrm_output_resume(skb, err); } } @@ -274,6 +407,41 @@ static struct ip_esp_hdr *esp6_output_udp_encap(struct sk_buff *skb, return (struct ip_esp_hdr *)(uh + 1); } +#ifdef CONFIG_INET6_ESPINTCP +static struct ip_esp_hdr *esp6_output_tcp_encap(struct xfrm_state *x, + struct sk_buff *skb, + struct esp_info *esp) +{ + __be16 *lenp = (void *)esp->esph; + struct ip_esp_hdr *esph; + unsigned int len; + struct sock *sk; + + len = skb->len + esp->tailen - skb_transport_offset(skb); + if (len > IP_MAX_MTU) + return ERR_PTR(-EMSGSIZE); + + rcu_read_lock(); + sk = esp6_find_tcp_sk(x); + rcu_read_unlock(); + + if (IS_ERR(sk)) + return ERR_CAST(sk); + + *lenp = htons(len); + esph = (struct ip_esp_hdr *)(lenp + 1); + + return esph; +} +#else +static struct ip_esp_hdr *esp6_output_tcp_encap(struct xfrm_state *x, + struct sk_buff *skb, + struct esp_info *esp) +{ + return ERR_PTR(-EOPNOTSUPP); +} +#endif + static int esp6_output_encap(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) { @@ -294,6 +462,9 @@ static int esp6_output_encap(struct xfrm_state *x, struct sk_buff *skb, case UDP_ENCAP_ESPINUDP_NON_IKE: esph = esp6_output_udp_encap(skb, encap_type, esp, sport, dport); break; + case TCP_ENCAP_ESPINTCP: + esph = esp6_output_tcp_encap(x, skb, esp); + break; } if (IS_ERR(esph)) @@ -509,6 +680,9 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info if (sg != dsg) esp_ssg_unref(x, tmp); + if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) + err = esp_output_tail_tcp(x, skb); + error_free: kfree(tmp); error: @@ -633,9 +807,13 @@ int esp6_input_done2(struct sk_buff *skb, int err) const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct xfrm_encap_tmpl *encap = x->encap; struct udphdr *uh = (void *)(skb_network_header(skb) + hdr_len); + struct tcphdr *th = (void *)(skb_network_header(skb) + hdr_len); __be16 source; switch (x->encap->encap_type) { + case TCP_ENCAP_ESPINTCP: + source = th->source; + break; case UDP_ENCAP_ESPINUDP: case UDP_ENCAP_ESPINUDP_NON_IKE: source = uh->source; @@ -1039,6 +1217,14 @@ static int esp6_init_state(struct xfrm_state *x) case UDP_ENCAP_ESPINUDP_NON_IKE: x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32); break; +#ifdef CONFIG_INET6_ESPINTCP + case TCP_ENCAP_ESPINTCP: + /* only the length field, TCP encap is done by + * the socket + */ + x->props.header_len += 2; + break; +#endif } } diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index 6921a18201a0..b7fd9c838416 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -99,4 +99,7 @@ config NET_KEY_MIGRATE If unsure, say N. 
+config XFRM_ESPINTCP + bool + endif # INET diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index 212a4fcb4a88..2d4bb4b9f75e 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -11,4 +11,4 @@ obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o obj-$(CONFIG_XFRM_USER) += xfrm_user.o obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o -obj-$(CONFIG_INET_ESPINTCP) += espintcp.o +obj-$(CONFIG_XFRM_ESPINTCP) += espintcp.o diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c index 037ea156d2f9..2132a3b6df0f 100644 --- a/net/xfrm/espintcp.c +++ b/net/xfrm/espintcp.c @@ -6,6 +6,9 @@ #include #include #include +#if IS_ENABLED(CONFIG_IPV6) +#include +#endif static void handle_nonesp(struct espintcp_ctx *ctx, struct sk_buff *skb, struct sock *sk) @@ -31,7 +34,12 @@ static void handle_esp(struct sk_buff *skb, struct sock *sk) rcu_read_lock(); skb->dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif); local_bh_disable(); - xfrm4_rcv_encap(skb, IPPROTO_ESP, 0, TCP_ENCAP_ESPINTCP); +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + ipv6_stub->xfrm6_rcv_encap(skb, IPPROTO_ESP, 0, TCP_ENCAP_ESPINTCP); + else +#endif + xfrm4_rcv_encap(skb, IPPROTO_ESP, 0, TCP_ENCAP_ESPINTCP); local_bh_enable(); rcu_read_unlock(); } @@ -347,6 +355,9 @@ unlock: static struct proto espintcp_prot __ro_after_init; static struct proto_ops espintcp_ops __ro_after_init; +static struct proto espintcp6_prot; +static struct proto_ops espintcp6_ops; +static DEFINE_MUTEX(tcpv6_prot_mutex); static void espintcp_data_ready(struct sock *sk) { @@ -384,10 +395,14 @@ static void espintcp_destruct(struct sock *sk) bool tcp_is_ulp_esp(struct sock *sk) { - return sk->sk_prot == &espintcp_prot; + return sk->sk_prot == &espintcp_prot || sk->sk_prot == &espintcp6_prot; } EXPORT_SYMBOL_GPL(tcp_is_ulp_esp); +static void build_protos(struct proto *espintcp_prot, + struct proto_ops *espintcp_ops, + const struct proto *orig_prot, + const struct proto_ops *orig_ops); static int espintcp_init_sk(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -415,8 +430,19 @@ static int espintcp_init_sk(struct sock *sk) strp_check_rcv(&ctx->strp); skb_queue_head_init(&ctx->ike_queue); skb_queue_head_init(&ctx->out_queue); - sk->sk_prot = &espintcp_prot; - sk->sk_socket->ops = &espintcp_ops; + + if (sk->sk_family == AF_INET) { + sk->sk_prot = &espintcp_prot; + sk->sk_socket->ops = &espintcp_ops; + } else { + mutex_lock(&tcpv6_prot_mutex); + if (!espintcp6_prot.recvmsg) + build_protos(&espintcp6_prot, &espintcp6_ops, sk->sk_prot, sk->sk_socket->ops); + mutex_unlock(&tcpv6_prot_mutex); + + sk->sk_prot = &espintcp6_prot; + sk->sk_socket->ops = &espintcp6_ops; + } ctx->saved_data_ready = sk->sk_data_ready; ctx->saved_write_space = sk->sk_write_space; sk->sk_data_ready = espintcp_data_ready; @@ -489,6 +515,20 @@ static __poll_t espintcp_poll(struct file *file, struct socket *sock, return mask; } +static void build_protos(struct proto *espintcp_prot, + struct proto_ops *espintcp_ops, + const struct proto *orig_prot, + const struct proto_ops *orig_ops) +{ + memcpy(espintcp_prot, orig_prot, sizeof(struct proto)); + memcpy(espintcp_ops, orig_ops, sizeof(struct proto_ops)); + espintcp_prot->sendmsg = espintcp_sendmsg; + espintcp_prot->recvmsg = espintcp_recvmsg; + espintcp_prot->close = espintcp_close; + espintcp_prot->release_cb = espintcp_release; + espintcp_ops->poll = espintcp_poll; +} + static struct tcp_ulp_ops espintcp_ulp __read_mostly = { .name = "espintcp", .owner = THIS_MODULE, @@ -497,13 +537,7 
@@ static struct tcp_ulp_ops espintcp_ulp __read_mostly = { void __init espintcp_init(void) { - memcpy(&espintcp_prot, &tcp_prot, sizeof(tcp_prot)); - memcpy(&espintcp_ops, &inet_stream_ops, sizeof(inet_stream_ops)); - espintcp_prot.sendmsg = espintcp_sendmsg; - espintcp_prot.recvmsg = espintcp_recvmsg; - espintcp_prot.close = espintcp_close; - espintcp_prot.release_cb = espintcp_release; - espintcp_ops.poll = espintcp_poll; + build_protos(&espintcp_prot, &espintcp_ops, &tcp_prot, &inet_stream_ops); tcp_register_ulp(&espintcp_ulp); } -- cgit v1.2.3 From 4364f2e91f0d44fa0e233d2a55e3ec35053d9bd9 Mon Sep 17 00:00:00 2001 From: Alain Michaud Date: Thu, 23 Apr 2020 14:43:29 +0000 Subject: Bluetooth: allow scatternet connections if supported. This change allows scatternet connections to be created if the controller reports support and the HCI_QUIRK_VALID_LE_STATES indicates that the reported LE states can be trusted. Signed-off-by: Alain Michaud Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 966fc543c01d..006c24e04b44 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5288,7 +5288,9 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, /* Most controller will fail if we try to create new connections * while we have an existing one in slave role. */ - if (hdev->conn_hash.le_num_slave > 0) + if (hdev->conn_hash.le_num_slave > 0 && + (!test_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks) || + !(hdev->le_states[3] & 0x10))) return NULL; /* If we're not connectable only connect devices that we have in -- cgit v1.2.3 From 11dd74b338bf83f8bca70b57bad33a903fedfa6e Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Mon, 27 Apr 2020 13:56:45 -0700 Subject: net: ipv6: new arg skip_notify to ip6_rt_del Used in subsequent work to skip route delete notifications on nexthop deletes. Suggested-by: David Ahern Signed-off-by: Roopa Prabhu Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_route.h | 2 +- include/net/ipv6_stubs.h | 2 +- net/ipv4/nexthop.c | 2 +- net/ipv6/addrconf.c | 12 ++++++------ net/ipv6/addrconf_core.c | 3 ++- net/ipv6/anycast.c | 4 ++-- net/ipv6/ndisc.c | 2 +- net/ipv6/route.c | 11 +++++++---- 8 files changed, 21 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 9947eb1e9eb6..e525f003e619 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -123,7 +123,7 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); int ip6_ins_rt(struct net *net, struct fib6_info *f6i); -int ip6_del_rt(struct net *net, struct fib6_info *f6i); +int ip6_del_rt(struct net *net, struct fib6_info *f6i, bool skip_notify); void rt6_flush_exceptions(struct fib6_info *f6i); void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args, diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index 3e7d2c0e79ca..a5f7c12c326a 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -48,7 +48,7 @@ struct ipv6_stub { struct netlink_ext_ack *extack); void (*fib6_nh_release)(struct fib6_nh *fib6_nh); void (*fib6_update_sernum)(struct net *net, struct fib6_info *rt); - int (*ip6_del_rt)(struct net *net, struct fib6_info *rt); + int (*ip6_del_rt)(struct net *net, struct fib6_info *rt, bool skip_notify); void (*fib6_rt_update)(struct net *net, struct fib6_info *rt, struct nl_info *info); diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index fdfca534d094..9999687ad6dc 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -784,7 +784,7 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) { /* __ip6_del_rt does a release, so do a hold here */ fib6_info_hold(f6i); - ipv6_stub->ip6_del_rt(net, f6i); + ipv6_stub->ip6_del_rt(net, f6i, false); } } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 27b4fb6e452b..2c4f20ec1e2a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1238,7 +1238,7 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, ifp->idev->dev, 0, RTF_DEFAULT, true); if (f6i) { if (del_rt) - ip6_del_rt(dev_net(ifp->idev->dev), f6i); + ip6_del_rt(dev_net(ifp->idev->dev), f6i, false); else { if (!(f6i->fib6_flags & RTF_EXPIRES)) fib6_set_expires(f6i, expires); @@ -2718,7 +2718,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) if (rt) { /* Autoconf prefix route */ if (valid_lft == 0) { - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); rt = NULL; } else if (addrconf_finite_timeout(rt_expires)) { /* not infinity */ @@ -3813,7 +3813,7 @@ restart: spin_unlock_bh(&ifa->lock); if (rt) - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); if (state != INET6_IFADDR_STATE_DEAD) { __ipv6_ifa_notify(RTM_DELADDR, ifa); @@ -4652,7 +4652,7 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp, prio = ifp->rt_priority ? : IP6_RT_PRIO_ADDRCONF; if (f6i->fib6_metric != prio) { /* delete old one */ - ip6_del_rt(dev_net(ifp->idev->dev), f6i); + ip6_del_rt(dev_net(ifp->idev->dev), f6i, false); /* add new one */ addrconf_prefix_route(modify_peer ? 
&ifp->peer_addr : &ifp->addr, @@ -6073,10 +6073,10 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) ifp->idev->dev, 0, 0, false); if (rt) - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); } if (ifp->rt) { - ip6_del_rt(net, ifp->rt); + ip6_del_rt(net, ifp->rt, false); ifp->rt = NULL; } rt_genid_bump_ipv6(net); diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index ea00ce3d4117..9ebf3fe0d2b1 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -185,7 +185,8 @@ static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, return -EAFNOSUPPORT; } -static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt) +static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt, + bool skip_notify) { return -EAFNOSUPPORT; } diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index fed91ab7ec46..893261230ffc 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -364,7 +364,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) ipv6_del_acaddr_hash(aca); addrconf_leave_solict(idev, &aca->aca_addr); - ip6_del_rt(dev_net(idev->dev), aca->aca_rt); + ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false); aca_put(aca); return 0; @@ -393,7 +393,7 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev) addrconf_leave_solict(idev, &aca->aca_addr); - ip6_del_rt(dev_net(idev->dev), aca->aca_rt); + ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false); aca_put(aca); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 1ecd4e9b0bdf..2d09c4da03ee 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1302,7 +1302,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) } } if (rt && lifetime == 0) { - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); rt = NULL; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 310cbddaa533..486c36a14f24 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -984,7 +984,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, gwaddr, dev); if (rt && !lifetime) { - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); rt = NULL; } @@ -3729,9 +3729,12 @@ out: return err; } -int ip6_del_rt(struct net *net, struct fib6_info *rt) +int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify) { - struct nl_info info = { .nl_net = net }; + struct nl_info info = { + .nl_net = net, + .skip_notify = skip_notify + }; return __ip6_del_rt(rt, &info); } @@ -4252,7 +4255,7 @@ restart: (!idev || idev->cnf.accept_ra != 2) && fib6_info_hold_safe(rt)) { rcu_read_unlock(); - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); goto restart; } } -- cgit v1.2.3 From 4f80116d3df3b23ee4b83ea8557629e1799bc230 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Mon, 27 Apr 2020 13:56:46 -0700 Subject: net: ipv4: add sysctl for nexthop api compatibility mode Current route nexthop API maintains user space compatibility with old route API by default. Dumps and netlink notifications support both new and old API format. In systems which have moved to the new API, this compatibility mode cancels some of the performance benefits provided by the new nexthop API. This patch adds new sysctl nexthop_compat_mode which is on by default but provides the ability to turn off compatibility mode allowing systems to run entirely with the new routing API. Old route API behaviour and support is not modified by this sysctl. Uses a single sysctl to cover both ipv4 and ipv6 following other sysctls. Covers dumps and delete notifications as suggested by David Ahern. 
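As a usage sketch (illustrative, not part of the patch): once every consumer on the system speaks the new API, compat mode can be switched off per network namespace through the sysctl added below, the equivalent of "sysctl -w net.ipv4.nexthop_compat_mode=0":

    /* Hypothetical helper: disable nexthop compatibility mode by writing
     * to the proc path exposed by the ipv4_net_table entry this patch adds.
     */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("/proc/sys/net/ipv4/nexthop_compat_mode", O_WRONLY);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        if (write(fd, "0", 1) != 1)
            perror("write");
        close(fd);
        return 0;
    }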
Signed-off-by: Roopa Prabhu Reviewed-by: David Ahern Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 12 ++++++++++++ include/net/netns/ipv4.h | 2 ++ net/ipv4/af_inet.c | 1 + net/ipv4/fib_semantics.c | 3 +++ net/ipv4/nexthop.c | 5 +++-- net/ipv4/sysctl_net_ipv4.c | 9 +++++++++ net/ipv6/route.c | 3 ++- 7 files changed, 32 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 9375324aa8e1..5cdc37c34830 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1560,6 +1560,18 @@ skip_notify_on_dev_down - BOOLEAN on userspace caches to track link events and evict routes. Default: false (generate message) +nexthop_compat_mode - BOOLEAN + The new nexthop API provides a means for managing nexthops independent of + prefixes. Backwards compatibility with the old route format is enabled by + default which means route dumps and notifications contain the new + nexthop attribute but also the full, expanded nexthop definition. + Further, updates or deletes of a nexthop configuration generate route + notifications for each fib entry using the nexthop. Once a system + understands the new API, this sysctl can be disabled to achieve full + performance benefits of the new API by disabling the nexthop expansion + and extraneous notifications. + Default: true (backward compat mode) + IPv6 Fragmentation: ip6frag_high_thresh - INTEGER diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 154b8f01499b..5acdb4d414c4 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -111,6 +111,8 @@ struct netns_ipv4 { int sysctl_tcp_early_demux; int sysctl_udp_early_demux; + int sysctl_nexthop_compat_mode; + int sysctl_fwmark_reflect; int sysctl_tcp_fwmark_accept; #ifdef CONFIG_NET_L3_MASTER_DEV diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index c618e242490f..6177c4ba0037 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1835,6 +1835,7 @@ static __net_init int inet_init_net(struct net *net) net->ipv4.sysctl_ip_early_demux = 1; net->ipv4.sysctl_udp_early_demux = 1; net->ipv4.sysctl_tcp_early_demux = 1; + net->ipv4.sysctl_nexthop_compat_mode = 1; #ifdef CONFIG_SYSCTL net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; #endif diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 55ca2e521828..e53871e4a097 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1780,6 +1780,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; if (nexthop_is_blackhole(fi->nh)) rtm->rtm_type = RTN_BLACKHOLE; + if (!fi->fib_net->ipv4.sysctl_nexthop_compat_mode) + goto offload; } if (nhs == 1) { @@ -1805,6 +1807,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; } +offload: if (fri->offload) rtm->rtm_flags |= RTM_F_OFFLOAD; if (fri->trap) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 9999687ad6dc..3957364d556c 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -784,7 +784,8 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) { /* __ip6_del_rt does a release, so do a hold here */ fib6_info_hold(f6i); - ipv6_stub->ip6_del_rt(net, f6i, false); + ipv6_stub->ip6_del_rt(net, f6i, + !net->ipv4.sysctl_nexthop_compat_mode); } } @@ -1041,7 +1042,7 @@ out: if (!rc) { nh_base_seq_inc(net); nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo);
- if (replace_notify) + if (replace_notify && net->ipv4.sysctl_nexthop_compat_mode) nexthop_replace_notify(net, new_nh, &cfg->nlinfo); } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 81b267e990a1..95ad71e76cc3 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -710,6 +710,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_tcp_early_demux }, + { + .procname = "nexthop_compat_mode", + .data = &init_net.ipv4.sysctl_nexthop_compat_mode, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { .procname = "ip_default_ttl", .data = &init_net.ipv4.sysctl_ip_default_ttl, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 486c36a14f24..803212aae4ca 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5557,7 +5557,8 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, if (nexthop_is_blackhole(rt->nh)) rtm->rtm_type = RTN_BLACKHOLE; - if (rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0) + if (net->ipv4.sysctl_nexthop_compat_mode && + rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0) goto nla_put_failure; rtm->rtm_flags |= nh_flags; -- cgit v1.2.3 From ff2269f16a1e1a7f8bbe72920d3d285ba3943572 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:21 +0200 Subject: docs: networking: convert atm.txt to ReST There isn't much to be done here. Just: - add SPDX header; - add a document title. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/atm.rst | 14 ++++++++++++++ Documentation/networking/atm.txt | 8 -------- Documentation/networking/index.rst | 1 + net/atm/Kconfig | 2 +- 4 files changed, 16 insertions(+), 9 deletions(-) create mode 100644 Documentation/networking/atm.rst delete mode 100644 Documentation/networking/atm.txt (limited to 'net') diff --git a/Documentation/networking/atm.rst b/Documentation/networking/atm.rst new file mode 100644 index 000000000000..c1df8c038525 --- /dev/null +++ b/Documentation/networking/atm.rst @@ -0,0 +1,14 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=== +ATM +=== + +In order to use anything but the most primitive functions of ATM, +several user-mode programs are required to assist the kernel. These +programs and related material can be found via the ATM on Linux Web +page at http://linux-atm.sourceforge.net/ + +If you encounter problems with ATM, please report them on the ATM +on Linux mailing list. Subscription information, archives, etc., +can be found on http://linux-atm.sourceforge.net/ diff --git a/Documentation/networking/atm.txt b/Documentation/networking/atm.txt deleted file mode 100644 index 82921cee77fe..000000000000 --- a/Documentation/networking/atm.txt +++ /dev/null @@ -1,8 +0,0 @@ -In order to use anything but the most primitive functions of ATM, -several user-mode programs are required to assist the kernel. These -programs and related material can be found via the ATM on Linux Web -page at http://linux-atm.sourceforge.net/ - -If you encounter problems with ATM, please report them on the ATM -on Linux mailing list. Subscription information, archives, etc., -can be found on http://linux-atm.sourceforge.net/ diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 3e0a4bb23ef9..841f3c3905d5 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -41,6 +41,7 @@ Contents: altera_tse arcnet-hardware arcnet + atm .. 
only:: subproject and html diff --git a/net/atm/Kconfig b/net/atm/Kconfig index 271f682e8438..e61dcc9f85b2 100644 --- a/net/atm/Kconfig +++ b/net/atm/Kconfig @@ -16,7 +16,7 @@ config ATM of your ATM card below. Note that you need a set of user-space programs to actually make use - of ATM. See the file for + of ATM. See the file for further details. config ATM_CLIP -- cgit v1.2.3 From 20b943f075574c233de51fa2f0124a97f0298be1 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:22 +0200 Subject: docs: networking: convert ax25.txt to ReST There isn't much to be done here. Just: - add SPDX header; - add a document title. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/ax25.rst | 16 ++++++++++++++++ Documentation/networking/ax25.txt | 10 ---------- Documentation/networking/index.rst | 1 + net/ax25/Kconfig | 6 +++--- 4 files changed, 20 insertions(+), 13 deletions(-) create mode 100644 Documentation/networking/ax25.rst delete mode 100644 Documentation/networking/ax25.txt (limited to 'net') diff --git a/Documentation/networking/ax25.rst b/Documentation/networking/ax25.rst new file mode 100644 index 000000000000..824afd7002db --- /dev/null +++ b/Documentation/networking/ax25.rst @@ -0,0 +1,16 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===== +AX.25 +===== + +To use the amateur radio protocols within Linux you will need to get a +suitable copy of the AX.25 Utilities. More detailed information about +AX.25, NET/ROM and ROSE, associated programs and utilities can be +found on http://www.linux-ax25.org. + +There is an active mailing list for discussing Linux amateur radio matters +called linux-hams@vger.kernel.org. To subscribe to it, send a message to +majordomo@vger.kernel.org with the words "subscribe linux-hams" in the body +of the message, the subject field is ignored. You don't need to be +subscribed to post but of course that means you might miss an answer. diff --git a/Documentation/networking/ax25.txt b/Documentation/networking/ax25.txt deleted file mode 100644 index 8257dbf9be57..000000000000 --- a/Documentation/networking/ax25.txt +++ /dev/null @@ -1,10 +0,0 @@ -To use the amateur radio protocols within Linux you will need to get a -suitable copy of the AX.25 Utilities. More detailed information about -AX.25, NET/ROM and ROSE, associated programs and and utilities can be -found on http://www.linux-ax25.org. - -There is an active mailing list for discussing Linux amateur radio matters -called linux-hams@vger.kernel.org. To subscribe to it, send a message to -majordomo@vger.kernel.org with the words "subscribe linux-hams" in the body -of the message, the subject field is ignored. You don't need to be -subscribed to post but of course that means you might miss an answer. diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 841f3c3905d5..6a5858b27cf6 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -42,6 +42,7 @@ Contents: arcnet-hardware arcnet atm + ax25 .. only:: subproject and html diff --git a/net/ax25/Kconfig b/net/ax25/Kconfig index 043fd5437809..97d686d115c0 100644 --- a/net/ax25/Kconfig +++ b/net/ax25/Kconfig @@ -40,7 +40,7 @@ config AX25 radio as well as information about how to configure an AX.25 port is contained in the AX25-HOWTO, available from . You might also want to - check out the file in the + check out the file in the kernel source. More information about digital amateur radio in general is on the WWW at .
@@ -88,7 +88,7 @@ config NETROM users as well as information about how to configure an AX.25 port is contained in the Linux Ham Wiki, available from . You also might want to check out the - file . More information about + file . More information about digital amateur radio in general is on the WWW at . @@ -107,7 +107,7 @@ config ROSE users as well as information about how to configure an AX.25 port is contained in the Linux Ham Wiki, available from . You also might want to check out the - file . More information about + file . More information about digital amateur radio in general is on the WWW at . -- cgit v1.2.3 From 9a69fb9c21c4bf4107becb877729544759bdd059 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:30 +0200 Subject: docs: networking: convert decnet.txt to ReST - add SPDX header; - adjust titles and chapters, adding proper markups; - mark lists as such; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/networking/decnet.rst | 243 ++++++++++++++++++++++++ Documentation/networking/decnet.txt | 230 ---------------------- Documentation/networking/index.rst | 1 + MAINTAINERS | 2 +- net/decnet/Kconfig | 4 +- 6 files changed, 248 insertions(+), 234 deletions(-) create mode 100644 Documentation/networking/decnet.rst delete mode 100644 Documentation/networking/decnet.txt (limited to 'net') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f2a93c8679e8..b23ab11587a6 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -831,7 +831,7 @@ decnet.addr= [HW,NET] Format: [,] - See also Documentation/networking/decnet.txt. + See also Documentation/networking/decnet.rst. default_hugepagesz= [same as hugepagesz=] The size of the default diff --git a/Documentation/networking/decnet.rst b/Documentation/networking/decnet.rst new file mode 100644 index 000000000000..b8bc11ff8370 --- /dev/null +++ b/Documentation/networking/decnet.rst @@ -0,0 +1,243 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================================= +Linux DECnet Networking Layer Information +========================================= + +1. Other documentation.... +========================== + + - Project Home Pages + - http://www.chygwyn.com/ - Kernel info + - http://linux-decnet.sourceforge.net/ - Userland tools + - http://www.sourceforge.net/projects/linux-decnet/ - Status page + +2. Configuring the kernel +========================= + +Be sure to turn on the following options: + + - CONFIG_DECNET (obviously) + - CONFIG_PROC_FS (to see what's going on) + - CONFIG_SYSCTL (for easy configuration) + +if you want to try out router support (not properly debugged yet) +you'll need the following options as well... + + - CONFIG_DECNET_ROUTER (to be able to add/delete routes) + - CONFIG_NETFILTER (will be required for the DECnet routing daemon) + +Don't turn on SIOCGIFCONF support for DECnet unless you are really sure +that you need it, in general you won't and it can cause ifconfig to +malfunction. + +Run time configuration has changed slightly from the 2.4 system. If you +want to configure an endnode, then the simplified procedure is as follows: + + - Set the MAC address on your ethernet card before starting _any_ other + network protocols. 
+ +As soon as your network card is brought into the UP state, DECnet should +start working. If you need something more complicated or are unsure how +to set the MAC address, see the next section. Also all configurations which +worked with 2.4 will work under 2.5 with no change. + +3. Command line options +======================= + +You can set a DECnet address on the kernel command line for compatibility +with the 2.4 configuration procedure, but in general it's not needed any more. +If you do set a DECnet address on the command line, it has only one purpose +which is that it is added to the addresses on the loopback device. + +With 2.4 kernels, DECnet would only recognise addresses as local if they +were added to the loopback device. In 2.5, any local interface address +can be used to loop back to the local machine. Of course this does not +prevent you adding further addresses to the loopback device if you +want to. + +N.B. Since the address list of an interface determines the addresses for +which "hello" messages are sent, if you don't set an address on the loopback +interface then you won't see any entries in /proc/net/neigh for the local +host until such time as you start a connection. This doesn't affect the +operation of the local communications in any other way though. + +The kernel command line takes options looking like the following:: + + decnet.addr=1,2 + +the two numbers are the node address 1,2 = 1.2 For 2.2.xx kernels +and early 2.3.xx kernels, you must use a comma when specifying the +DECnet address like this. For more recent 2.3.xx kernels, you may +use almost any character except space, although a `.` would be the most +obvious choice :-) + +There used to be a third number specifying the node type. This option +has gone away in favour of a per interface node type. This is now set +using /proc/sys/net/decnet/conf//forwarding. This file can be +set with a single digit, 0=EndNode, 1=L1 Router and 2=L2 Router. + +There are also equivalent options for modules. The node address can +also be set through the /proc/sys/net/decnet/ files, as can other system +parameters. + +Currently the only supported devices are ethernet and ip_gre. The +ethernet address of your ethernet card has to be set according to the DECnet +address of the node in order for it to be autoconfigured (and then appear in +/proc/net/decnet_dev). There is a utility available at the above +FTP sites called dn2ethaddr which can compute the correct ethernet +address to use. The address can be set by ifconfig either before or +at the time the device is brought up. If you are using RedHat you can +add the line:: + + MACADDR=AA:00:04:00:03:04 + +or something similar, to /etc/sysconfig/network-scripts/ifcfg-eth0 or +wherever your network card's configuration lives. Setting the MAC address +of your ethernet card to an address starting with "hi-ord" will cause a +DECnet address which matches to be added to the interface (which you can +verify with iproute2). + +The default device for routing can be set through the /proc filesystem +by setting /proc/sys/net/decnet/default_device to the +device you want DECnet to route packets out of when no specific route +is available. Usually this will be eth0, for example:: + + echo -n "eth0" >/proc/sys/net/decnet/default_device + +If you don't set the default device, then it will default to the first +ethernet card which has been autoconfigured as described above. You can +confirm that by looking in the default_device file of course.
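As a sketch of the computation dn2ethaddr performs (assuming the standard DECnet convention of a 6-bit area and 10-bit node packed into a 16-bit identifier, stored little-endian behind the AA:00:04:00 "hi-ord" prefix):

    /* Map a DECnet area.node address to its Ethernet MAC address.
     * Example: 1.2 -> id 0x0402 -> AA:00:04:00:02:04, matching the
     * MACADDR style shown above.  Input validation is omitted.
     */
    #include <stdio.h>

    static void dn_to_ethaddr(unsigned int area, unsigned int node)
    {
        unsigned int id = (area << 10) | node;  /* area 1-63, node 1-1023 */

        printf("AA:00:04:00:%02X:%02X\n", id & 0xff, (id >> 8) & 0xff);
    }

    int main(void)
    {
        dn_to_ethaddr(1, 2);    /* prints AA:00:04:00:02:04 */
        return 0;
    }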
+ +There is a list of what the other files under /proc/sys/net/decnet/ do +on the kernel patch web site (shown above). + +4. Run time kernel configuration +================================ + + +This is either done through the sysctl/proc interface (see the kernel web +pages for details on what the various options do) or through the iproute2 +package in the same way as IPv4/6 configuration is performed. + +Documentation for iproute2 is included with the package, although there is +as yet no specific section on DECnet, most of the features apply to both +IP and DECnet, albeit with DECnet addresses instead of IP addresses and +a reduced functionality. + +If you want to configure a DECnet router you'll need the iproute2 package +since it's the _only_ way to add and delete routes currently. Eventually +there will be a routing daemon to send and receive routing messages for +each interface and update the kernel routing tables accordingly. The +routing daemon will use netfilter to listen to routing packets, and +rtnetlink to update the kernel's routing tables. + +The DECnet raw socket layer has been removed since it was there purely +for use by the routing daemon which will now use netfilter (a much cleaner +and more generic solution) instead. + +5. How can I tell if it's working? +================================= + +Here is a quick guide of what to look for in order to know if your DECnet +kernel subsystem is working. + + - Is the node address set (see /proc/sys/net/decnet/node_address) + - Is the node of the correct type + (see /proc/sys/net/decnet/conf//forwarding) + - Is the Ethernet MAC address of each Ethernet card set to match + the DECnet address. If in doubt use the dn2ethaddr utility available + at the ftp archive. + - If the previous two steps are satisfied, and the Ethernet card is up, + you should find that it is listed in /proc/net/decnet_dev and also + that it appears as a directory in /proc/sys/net/decnet/conf/. The + loopback device (lo) should also appear and is required to communicate + within a node. + - If you have any DECnet routers on your network, they should appear + in /proc/net/decnet_neigh, otherwise this file will only contain the + entry for the node itself (if it doesn't, check to see if lo is up). + - If you want to send to any node which is not listed in the + /proc/net/decnet_neigh file, you'll need to set the default device + to point to an Ethernet card with connection to a router. This is + again done with the /proc/sys/net/decnet/default_device file. + - Try starting a simple server and client, like the dnping/dnmirror + over the loopback interface. With luck they should communicate. + For this step and those after, you'll need the DECnet library + which can be obtained from the above ftp sites as well as the + actual utilities themselves. + - If this seems to work, then try talking to a node on your local + network, and see if you can obtain the same results. + - At this point you are on your own... :-) + +6. How to send a bug report +=========================== + +If you've found a bug and want to report it, then there are several things +you can do to help me work out exactly what it is that is wrong. Useful +information (_most_ of which _is_ _essential_) includes: + + - What kernel version are you running ? + - What version of the patch are you running ? + - How far through the above set of tests can you get ? + - What is in the /proc/decnet* files and /proc/sys/net/decnet/* files ? + - Which services are you running ? + - Which client caused the problem ?
+ - How much data was being transferred ? + - Was the network congested ? + - How can the problem be reproduced ? + - Can you use tcpdump to get a trace ? (N.B. Most (all?) versions of + tcpdump don't understand how to dump DECnet properly, so including + the hex listing of the packet contents is _essential_, usually the -x flag. + You may also need to increase the length grabbed with the -s flag. The + -e flag also provides very useful information (ethernet MAC addresses)) + +7. MAC FAQ +========== + +A quick FAQ on ethernet MAC addresses to explain how Linux and DECnet +interact and how to get the best performance from your hardware. + +Ethernet cards are designed to normally only pass received network frames +to a host computer when they are addressed to it, or to the broadcast address. + +Linux has an interface which allows the setting of extra addresses for +an ethernet card to listen to. If the ethernet card supports it, the +filtering operation will be done in hardware, if not the extra unwanted packets +received will be discarded by the host computer. In the latter case, +significant processor time and bus bandwidth can be used up on a busy +network (see the NAPI documentation for a longer explanation of these +effects). + +DECnet makes use of this interface to allow running DECnet on an ethernet +card which has already been configured using TCP/IP (presumably using the +built in MAC address of the card, as usual) and/or to allow multiple DECnet +addresses on each physical interface. If you do this, be aware that if your +ethernet card doesn't support perfect hashing in its MAC address filter +then your computer will be doing more work than required. Some cards +will simply set themselves into promiscuous mode in order to receive +packets from the DECnet specified addresses. So if you have one of these +cards its better to set the MAC address of the card as described above +to gain the best efficiency. Better still is to use a card which supports +NAPI as well. + + +8. Mailing list +=============== + +If you are keen to get involved in development, or want to ask questions +about configuration, or even just report bugs, then there is a mailing +list that you can join, details are at: + +http://sourceforge.net/mail/?group_id=4993 + +9. Legal Info +============= + +The Linux DECnet project team have placed their code under the GPL. The +software is provided "as is" and without warranty express or implied. +DECnet is a trademark of Compaq. This software is not a product of +Compaq. We acknowledge the help of people at Compaq in providing extra +documentation above and beyond what was previously publicly available. + +Steve Whitehouse + diff --git a/Documentation/networking/decnet.txt b/Documentation/networking/decnet.txt deleted file mode 100644 index d192f8b9948b..000000000000 --- a/Documentation/networking/decnet.txt +++ /dev/null @@ -1,230 +0,0 @@ - Linux DECnet Networking Layer Information - =========================================== - -1) Other documentation.... - - o Project Home Pages - http://www.chygwyn.com/ - Kernel info - http://linux-decnet.sourceforge.net/ - Userland tools - http://www.sourceforge.net/projects/linux-decnet/ - Status page - -2) Configuring the kernel - -Be sure to turn on the following options: - - CONFIG_DECNET (obviously) - CONFIG_PROC_FS (to see what's going on) - CONFIG_SYSCTL (for easy configuration) - -if you want to try out router support (not properly debugged yet) -you'll need the following options as well... 
- - CONFIG_DECNET_ROUTER (to be able to add/delete routes) - CONFIG_NETFILTER (will be required for the DECnet routing daemon) - -Don't turn on SIOCGIFCONF support for DECnet unless you are really sure -that you need it, in general you won't and it can cause ifconfig to -malfunction. - -Run time configuration has changed slightly from the 2.4 system. If you -want to configure an endnode, then the simplified procedure is as follows: - - o Set the MAC address on your ethernet card before starting _any_ other - network protocols. - -As soon as your network card is brought into the UP state, DECnet should -start working. If you need something more complicated or are unsure how -to set the MAC address, see the next section. Also all configurations which -worked with 2.4 will work under 2.5 with no change. - -3) Command line options - -You can set a DECnet address on the kernel command line for compatibility -with the 2.4 configuration procedure, but in general it's not needed any more. -If you do st a DECnet address on the command line, it has only one purpose -which is that its added to the addresses on the loopback device. - -With 2.4 kernels, DECnet would only recognise addresses as local if they -were added to the loopback device. In 2.5, any local interface address -can be used to loop back to the local machine. Of course this does not -prevent you adding further addresses to the loopback device if you -want to. - -N.B. Since the address list of an interface determines the addresses for -which "hello" messages are sent, if you don't set an address on the loopback -interface then you won't see any entries in /proc/net/neigh for the local -host until such time as you start a connection. This doesn't affect the -operation of the local communications in any other way though. - -The kernel command line takes options looking like the following: - - decnet.addr=1,2 - -the two numbers are the node address 1,2 = 1.2 For 2.2.xx kernels -and early 2.3.xx kernels, you must use a comma when specifying the -DECnet address like this. For more recent 2.3.xx kernels, you may -use almost any character except space, although a `.` would be the most -obvious choice :-) - -There used to be a third number specifying the node type. This option -has gone away in favour of a per interface node type. This is now set -using /proc/sys/net/decnet/conf//forwarding. This file can be -set with a single digit, 0=EndNode, 1=L1 Router and 2=L2 Router. - -There are also equivalent options for modules. The node address can -also be set through the /proc/sys/net/decnet/ files, as can other system -parameters. - -Currently the only supported devices are ethernet and ip_gre. The -ethernet address of your ethernet card has to be set according to the DECnet -address of the node in order for it to be autoconfigured (and then appear in -/proc/net/decnet_dev). There is a utility available at the above -FTP sites called dn2ethaddr which can compute the correct ethernet -address to use. The address can be set by ifconfig either before or -at the time the device is brought up. If you are using RedHat you can -add the line: - - MACADDR=AA:00:04:00:03:04 - -or something similar, to /etc/sysconfig/network-scripts/ifcfg-eth0 or -wherever your network card's configuration lives. Setting the MAC address -of your ethernet card to an address starting with "hi-ord" will cause a -DECnet address which matches to be added to the interface (which you can -verify with iproute2). 
- -The default device for routing can be set through the /proc filesystem -by setting /proc/sys/net/decnet/default_device to the -device you want DECnet to route packets out of when no specific route -is available. Usually this will be eth0, for example: - - echo -n "eth0" >/proc/sys/net/decnet/default_device - -If you don't set the default device, then it will default to the first -ethernet card which has been autoconfigured as described above. You can -confirm that by looking in the default_device file of course. - -There is a list of what the other files under /proc/sys/net/decnet/ do -on the kernel patch web site (shown above). - -4) Run time kernel configuration - -This is either done through the sysctl/proc interface (see the kernel web -pages for details on what the various options do) or through the iproute2 -package in the same way as IPv4/6 configuration is performed. - -Documentation for iproute2 is included with the package, although there is -as yet no specific section on DECnet, most of the features apply to both -IP and DECnet, albeit with DECnet addresses instead of IP addresses and -a reduced functionality. - -If you want to configure a DECnet router you'll need the iproute2 package -since its the _only_ way to add and delete routes currently. Eventually -there will be a routing daemon to send and receive routing messages for -each interface and update the kernel routing tables accordingly. The -routing daemon will use netfilter to listen to routing packets, and -rtnetlink to update the kernels routing tables. - -The DECnet raw socket layer has been removed since it was there purely -for use by the routing daemon which will now use netfilter (a much cleaner -and more generic solution) instead. - -5) How can I tell if its working ? - -Here is a quick guide of what to look for in order to know if your DECnet -kernel subsystem is working. - - - Is the node address set (see /proc/sys/net/decnet/node_address) - - Is the node of the correct type - (see /proc/sys/net/decnet/conf//forwarding) - - Is the Ethernet MAC address of each Ethernet card set to match - the DECnet address. If in doubt use the dn2ethaddr utility available - at the ftp archive. - - If the previous two steps are satisfied, and the Ethernet card is up, - you should find that it is listed in /proc/net/decnet_dev and also - that it appears as a directory in /proc/sys/net/decnet/conf/. The - loopback device (lo) should also appear and is required to communicate - within a node. - - If you have any DECnet routers on your network, they should appear - in /proc/net/decnet_neigh, otherwise this file will only contain the - entry for the node itself (if it doesn't check to see if lo is up). - - If you want to send to any node which is not listed in the - /proc/net/decnet_neigh file, you'll need to set the default device - to point to an Ethernet card with connection to a router. This is - again done with the /proc/sys/net/decnet/default_device file. - - Try starting a simple server and client, like the dnping/dnmirror - over the loopback interface. With luck they should communicate. - For this step and those after, you'll need the DECnet library - which can be obtained from the above ftp sites as well as the - actual utilities themselves. - - If this seems to work, then try talking to a node on your local - network, and see if you can obtain the same results. - - At this point you are on your own... 
:-) - -6) How to send a bug report - -If you've found a bug and want to report it, then there are several things -you can do to help me work out exactly what it is that is wrong. Useful -information (_most_ of which _is_ _essential_) includes: - - - What kernel version are you running ? - - What version of the patch are you running ? - - How far though the above set of tests can you get ? - - What is in the /proc/decnet* files and /proc/sys/net/decnet/* files ? - - Which services are you running ? - - Which client caused the problem ? - - How much data was being transferred ? - - Was the network congested ? - - How can the problem be reproduced ? - - Can you use tcpdump to get a trace ? (N.B. Most (all?) versions of - tcpdump don't understand how to dump DECnet properly, so including - the hex listing of the packet contents is _essential_, usually the -x flag. - You may also need to increase the length grabbed with the -s flag. The - -e flag also provides very useful information (ethernet MAC addresses)) - -7) MAC FAQ - -A quick FAQ on ethernet MAC addresses to explain how Linux and DECnet -interact and how to get the best performance from your hardware. - -Ethernet cards are designed to normally only pass received network frames -to a host computer when they are addressed to it, or to the broadcast address. - -Linux has an interface which allows the setting of extra addresses for -an ethernet card to listen to. If the ethernet card supports it, the -filtering operation will be done in hardware, if not the extra unwanted packets -received will be discarded by the host computer. In the latter case, -significant processor time and bus bandwidth can be used up on a busy -network (see the NAPI documentation for a longer explanation of these -effects). - -DECnet makes use of this interface to allow running DECnet on an ethernet -card which has already been configured using TCP/IP (presumably using the -built in MAC address of the card, as usual) and/or to allow multiple DECnet -addresses on each physical interface. If you do this, be aware that if your -ethernet card doesn't support perfect hashing in its MAC address filter -then your computer will be doing more work than required. Some cards -will simply set themselves into promiscuous mode in order to receive -packets from the DECnet specified addresses. So if you have one of these -cards its better to set the MAC address of the card as described above -to gain the best efficiency. Better still is to use a card which supports -NAPI as well. - - -8) Mailing list - -If you are keen to get involved in development, or want to ask questions -about configuration, or even just report bugs, then there is a mailing -list that you can join, details are at: - -http://sourceforge.net/mail/?group_id=4993 - -9) Legal Info - -The Linux DECnet project team have placed their code under the GPL. The -software is provided "as is" and without warranty express or implied. -DECnet is a trademark of Compaq. This software is not a product of -Compaq. We acknowledge the help of people at Compaq in providing extra -documentation above and beyond what was previously publicly available. - -Steve Whitehouse - diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 9e83d3bda4e0..e17432492745 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -50,6 +50,7 @@ Contents: cxacru dccp dctcp + decnet .. 
only:: subproject and html diff --git a/MAINTAINERS b/MAINTAINERS index 453fe0713e68..7323bfc1720f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4728,7 +4728,7 @@ DECnet NETWORK LAYER L: linux-decnet-user@lists.sourceforge.net S: Orphan W: http://linux-decnet.sourceforge.net -F: Documentation/networking/decnet.txt +F: Documentation/networking/decnet.rst F: net/decnet/ DECSTATION PLATFORM SUPPORT diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig index 0935453ccfd5..8f98fb2f2ec9 100644 --- a/net/decnet/Kconfig +++ b/net/decnet/Kconfig @@ -15,7 +15,7 @@ config DECNET . More detailed documentation is available in - . + . Be sure to say Y to "/proc file system support" and "Sysctl support" below when using DECnet, since you will need sysctl support to aid @@ -40,4 +40,4 @@ config DECNET_ROUTER filtering" option will be required for the forthcoming routing daemon to work. - See for more information. + See for more information. -- cgit v1.2.3 From 9dfe1361261be48c92fd7cb26909cbcd5d496220 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:32 +0200 Subject: docs: networking: convert dns_resolver.txt to ReST - add SPDX header; - adjust titles and chapters, adding proper markups; - comment out text-only TOC from html/pdf output; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/dns_resolver.rst | 155 +++++++++++++++++++++++++++++ Documentation/networking/dns_resolver.txt | 157 ------------------------------ Documentation/networking/index.rst | 1 + net/ceph/Kconfig | 2 +- net/dns_resolver/Kconfig | 2 +- net/dns_resolver/dns_key.c | 2 +- net/dns_resolver/dns_query.c | 2 +- 7 files changed, 160 insertions(+), 161 deletions(-) create mode 100644 Documentation/networking/dns_resolver.rst delete mode 100644 Documentation/networking/dns_resolver.txt (limited to 'net') diff --git a/Documentation/networking/dns_resolver.rst b/Documentation/networking/dns_resolver.rst new file mode 100644 index 000000000000..add4d59a99a5 --- /dev/null +++ b/Documentation/networking/dns_resolver.rst @@ -0,0 +1,155 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=================== +DNS Resolver Module +=================== + +.. Contents: + + - Overview. + - Compilation. + - Setting up. + - Usage. + - Mechanism. + - Debugging. + + +Overview +======== + +The DNS resolver module provides a way for kernel services to make DNS queries +by way of requesting a key of key type dns_resolver. These queries are +upcalled to userspace through /sbin/request-key. + +These routines must be supported by userspace tools dns.upcall, cifs.upcall and +request-key. It is under development and does not yet provide the full feature +set. The features it does support include: + + (*) Implements the dns_resolver key_type to contact userspace. + +It does not yet support the following AFS features: + + (*) Dns query support for AFSDB resource record. + +This code is extracted from the CIFS filesystem. + + +Compilation +=========== + +The module should be enabled by turning on the kernel configuration options:: + + CONFIG_DNS_RESOLVER - tristate "DNS Resolver support" + + +Setting up +========== + +To set up this facility, the /etc/request-key.conf file must be altered so that +/sbin/request-key can appropriately direct the upcalls. 
For example, to handle +basic dname to IPv4/IPv6 address resolution, the following line should be +added:: + + + #OP TYPE DESC CO-INFO PROGRAM ARG1 ARG2 ARG3 ... + #====== ============ ======= ======= ========================== + create dns_resolver * * /usr/sbin/cifs.upcall %k + +To direct a query for query type 'foo', a line of the following should be added +before the more general line given above as the first match is the one taken:: + + create dns_resolver foo:* * /usr/sbin/dns.foo %k + + +Usage +===== + +To make use of this facility, one of the following functions that are +implemented in the module can be called after doing:: + + #include + + :: + + int dns_query(const char *type, const char *name, size_t namelen, + const char *options, char **_result, time_t *_expiry); + + This is the basic access function. It looks for a cached DNS query and if + it doesn't find it, it upcalls to userspace to make a new DNS query, which + may then be cached. The key description is constructed as a string of the + form:: + + [:] + + where optionally specifies the particular upcall program to invoke, + and thus the type of query to do, and specifies the string to be + looked up. The default query type is a straight hostname to IP address + set lookup. + + The name parameter is not required to be a NUL-terminated string, and its + length should be given by the namelen argument. + + The options parameter may be NULL or it may be a set of options + appropriate to the query type. + + The return value is a string appropriate to the query type. For instance, + for the default query type it is just a list of comma-separated IPv4 and + IPv6 addresses. The caller must free the result. + + The length of the result string is returned on success, and a negative + error code is returned otherwise. -EKEYREJECTED will be returned if the + DNS lookup failed. + + If _expiry is non-NULL, the expiry time (TTL) of the result will be + returned also. + +The kernel maintains an internal keyring in which it caches looked up keys. +This can be cleared by any process that has the CAP_SYS_ADMIN capability by +the use of KEYCTL_KEYRING_CLEAR on the keyring ID. + + +Reading DNS Keys from Userspace +=============================== + +Keys of dns_resolver type can be read from userspace using keyctl_read() or +"keyctl read/print/pipe". + + +Mechanism +========= + +The dnsresolver module registers a key type called "dns_resolver". Keys of +this type are used to transport and cache DNS lookup results from userspace. + +When dns_query() is invoked, it calls request_key() to search the local +keyrings for a cached DNS result. If that fails to find one, it upcalls to +userspace to get a new result. + +Upcalls to userspace are made through the request_key() upcall vector, and are +directed by means of configuration lines in /etc/request-key.conf that tell +/sbin/request-key what program to run to instantiate the key. + +The upcall handler program is responsible for querying the DNS, processing the +result into a form suitable for passing to the keyctl_instantiate_key() +routine. This then passes the data to dns_resolver_instantiate() which strips +off and processes any options included in the data, and then attaches the +remainder of the string to the key as its payload. + +The upcall handler program should set the expiry time on the key to that of the +lowest TTL of all the records it has extracted a result from. This means that +the key will be discarded and recreated when the data it holds has expired. 
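To make the calling convention concrete, here is a hypothetical kernel-side caller built against the dns_query() prototype quoted above (the prototype has varied across kernel versions, and resolve_host() with its logging is illustrative only):

    #include <linux/dns_resolver.h>
    #include <linux/kernel.h>
    #include <linux/slab.h>
    #include <linux/string.h>

    static int resolve_host(const char *name)
    {
        char *result = NULL;
        time_t expiry;
        int len;

        /* A NULL type selects the default hostname -> address-list query */
        len = dns_query(NULL, name, strlen(name), NULL, &result, &expiry);
        if (len < 0)
            return len;     /* -EKEYREJECTED means the DNS lookup failed */

        pr_info("%s -> %.*s (ttl ends %ld)\n",
                name, len, result, (long)expiry);
        kfree(result);      /* the caller must free the result */
        return 0;
    }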
+ +dns_query() returns a copy of the value attached to the key, or an error if +that is indicated instead. + +See <file:Documentation/security/keys/request-key.rst> for further +information about request-key function. + + +Debugging +========= + +Debugging messages can be turned on dynamically by writing a 1 into the +following file:: + + /sys/module/dnsresolver/parameters/debug diff --git a/Documentation/networking/dns_resolver.txt b/Documentation/networking/dns_resolver.txt deleted file mode 100644 index eaa8f9a6fd5d..000000000000 --- a/Documentation/networking/dns_resolver.txt +++ /dev/null @@ -1,157 +0,0 @@ - =================== - DNS Resolver Module - =================== - -Contents: - - - Overview. - - Compilation. - - Setting up. - - Usage. - - Mechanism. - - Debugging. - - -======== -OVERVIEW -======== - -The DNS resolver module provides a way for kernel services to make DNS queries -by way of requesting a key of key type dns_resolver. These queries are -upcalled to userspace through /sbin/request-key. - -These routines must be supported by userspace tools dns.upcall, cifs.upcall and -request-key. It is under development and does not yet provide the full feature -set. The features it does support include: - - (*) Implements the dns_resolver key_type to contact userspace. - -It does not yet support the following AFS features: - - (*) Dns query support for AFSDB resource record. - -This code is extracted from the CIFS filesystem. - - -=========== -COMPILATION -=========== - -The module should be enabled by turning on the kernel configuration options: - - CONFIG_DNS_RESOLVER - tristate "DNS Resolver support" - - -========== -SETTING UP -========== - -To set up this facility, the /etc/request-key.conf file must be altered so that -/sbin/request-key can appropriately direct the upcalls. For example, to handle -basic dname to IPv4/IPv6 address resolution, the following line should be -added: - - #OP TYPE DESC CO-INFO PROGRAM ARG1 ARG2 ARG3 ... - #====== ============ ======= ======= ========================== - create dns_resolver * * /usr/sbin/cifs.upcall %k - -To direct a query for query type 'foo', a line of the following should be added -before the more general line given above as the first match is the one taken. - - create dns_resolver foo:* * /usr/sbin/dns.foo %k - - -===== -USAGE -===== - -To make use of this facility, one of the following functions that are -implemented in the module can be called after doing: - - #include <linux/dns_resolver.h> - - (1) int dns_query(const char *type, const char *name, size_t namelen, - const char *options, char **_result, time_t *_expiry); - - This is the basic access function. It looks for a cached DNS query and if - it doesn't find it, it upcalls to userspace to make a new DNS query, which - may then be cached. The key description is constructed as a string of the - form: - - [<query_type>:]<name> - - where <query_type> optionally specifies the particular upcall program to invoke, - and thus the type of query to do, and <name> specifies the string to be - looked up. The default query type is a straight hostname to IP address - set lookup. - - The name parameter is not required to be a NUL-terminated string, and its - length should be given by the namelen argument. - - The options parameter may be NULL or it may be a set of options - appropriate to the query type. - - The return value is a string appropriate to the query type. For instance, - for the default query type it is just a list of comma-separated IPv4 and - IPv6 addresses. The caller must free the result.
- - The length of the result string is returned on success, and a negative - error code is returned otherwise. -EKEYREJECTED will be returned if the - DNS lookup failed. - - If _expiry is non-NULL, the expiry time (TTL) of the result will be - returned also. - -The kernel maintains an internal keyring in which it caches looked up keys. -This can be cleared by any process that has the CAP_SYS_ADMIN capability by -the use of KEYCTL_KEYRING_CLEAR on the keyring ID. - - -=============================== -READING DNS KEYS FROM USERSPACE -=============================== - -Keys of dns_resolver type can be read from userspace using keyctl_read() or -"keyctl read/print/pipe". - - -========= -MECHANISM -========= - -The dnsresolver module registers a key type called "dns_resolver". Keys of -this type are used to transport and cache DNS lookup results from userspace. - -When dns_query() is invoked, it calls request_key() to search the local -keyrings for a cached DNS result. If that fails to find one, it upcalls to -userspace to get a new result. - -Upcalls to userspace are made through the request_key() upcall vector, and are -directed by means of configuration lines in /etc/request-key.conf that tell -/sbin/request-key what program to run to instantiate the key. - -The upcall handler program is responsible for querying the DNS, processing the -result into a form suitable for passing to the keyctl_instantiate_key() -routine. This then passes the data to dns_resolver_instantiate() which strips -off and processes any options included in the data, and then attaches the -remainder of the string to the key as its payload. - -The upcall handler program should set the expiry time on the key to that of the -lowest TTL of all the records it has extracted a result from. This means that -the key will be discarded and recreated when the data it holds has expired. - -dns_query() returns a copy of the value attached to the key, or an error if -that is indicated instead. - -See <file:Documentation/security/keys/request-key.rst> for further -information about request-key function. - - -========= -DEBUGGING -========= - -Debugging messages can be turned on dynamically by writing a 1 into the -following file: - - /sys/module/dnsresolver/parameters/debug diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index c893127004b9..55746038a7e9 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -52,6 +52,7 @@ Contents: dctcp decnet defza + dns_resolver .. only:: subproject and html diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig index 2e8e6f904920..d7bec7adc267 100644 --- a/net/ceph/Kconfig +++ b/net/ceph/Kconfig @@ -39,6 +39,6 @@ config CEPH_LIB_USE_DNS_RESOLVER be resolved using the CONFIG_DNS_RESOLVER facility. For information on how to use CONFIG_DNS_RESOLVER consult - Documentation/networking/dns_resolver.txt + Documentation/networking/dns_resolver.rst If unsure, say N. diff --git a/net/dns_resolver/Kconfig b/net/dns_resolver/Kconfig index 0a1c2238b4bd..255df9b6e9e8 100644 --- a/net/dns_resolver/Kconfig +++ b/net/dns_resolver/Kconfig @@ -19,7 +19,7 @@ config DNS_RESOLVER SMB2 later. DNS Resolver is supported by the userspace upcall helper "/sbin/dns.resolver" via /etc/request-key.conf. - See <file:Documentation/networking/dns_resolver.txt> for further + See <file:Documentation/networking/dns_resolver.rst> for further information.
To compile this as a module, choose M here: the module will be called diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index ad53eb31d40f..3aced951d5ab 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -1,6 +1,6 @@ /* Key type used to cache DNS lookups made by the kernel * - * See Documentation/networking/dns_resolver.txt + * See Documentation/networking/dns_resolver.rst * * Copyright (c) 2007 Igor Mammedov * Author(s): Igor Mammedov (niallain@gmail.com) diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index cab4e0df924f..82b084cc1cc6 100644 --- a/net/dns_resolver/dns_query.c +++ b/net/dns_resolver/dns_query.c @@ -1,7 +1,7 @@ /* Upcall routine, designed to work as a key type and working through * /sbin/request-key to contact userspace when handling DNS queries. * - * See Documentation/networking/dns_resolver.txt + * See Documentation/networking/dns_resolver.rst * * Copyright (c) 2007 Igor Mammedov * Author(s): Igor Mammedov (niallain@gmail.com) -- cgit v1.2.3 From 8c498935585680284e5f3e5294d3c901b7c89d57 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:41 +0200 Subject: docs: networking: convert gen_stats.txt to ReST - add SPDX header; - mark code blocks and literals as such; - mark tables as such; - mark lists as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/gen_stats.rst | 129 +++++++++++++++++++++++++++++++++ Documentation/networking/gen_stats.txt | 119 ------------------------------ Documentation/networking/index.rst | 1 + net/core/gen_stats.c | 2 +- 4 files changed, 131 insertions(+), 120 deletions(-) create mode 100644 Documentation/networking/gen_stats.rst delete mode 100644 Documentation/networking/gen_stats.txt (limited to 'net') diff --git a/Documentation/networking/gen_stats.rst b/Documentation/networking/gen_stats.rst new file mode 100644 index 000000000000..595a83b9a61b --- /dev/null +++ b/Documentation/networking/gen_stats.rst @@ -0,0 +1,129 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============================================== +Generic networking statistics for netlink users +=============================================== + +Statistic counters are grouped into structs: + +==================== ===================== ===================== +Struct TLV type Description +==================== ===================== ===================== +gnet_stats_basic TCA_STATS_BASIC Basic statistics +gnet_stats_rate_est TCA_STATS_RATE_EST Rate estimator +gnet_stats_queue TCA_STATS_QUEUE Queue statistics +none TCA_STATS_APP Application specific +==================== ===================== ===================== + + +Collecting: +----------- + +Declare the statistic structs you need:: + + struct mystruct { + struct gnet_stats_basic bstats; + struct gnet_stats_queue qstats; + ... + }; + +Update statistics, in dequeue() methods only, (while owning qdisc->running):: + + mystruct->tstats.packet++; + mystruct->qstats.backlog += skb->pkt_len; + + +Export to userspace (Dump): +--------------------------- + +:: + + my_dumping_routine(struct sk_buff *skb, ...) 
+ { + struct gnet_dump dump; + + if (gnet_stats_start_copy(skb, TCA_STATS2, &mystruct->lock, &dump, + TCA_PAD) < 0) + goto rtattr_failure; + + if (gnet_stats_copy_basic(&dump, &mystruct->bstats) < 0 || + gnet_stats_copy_queue(&dump, &mystruct->qstats) < 0 || + gnet_stats_copy_app(&dump, &xstats, sizeof(xstats)) < 0) + goto rtattr_failure; + + if (gnet_stats_finish_copy(&dump) < 0) + goto rtattr_failure; + ... + } + +TCA_STATS/TCA_XSTATS backward compatibility: +-------------------------------------------- + +Prior users of struct tc_stats and xstats can maintain backward +compatibility by calling the compat wrappers to keep providing the +existing TLV types:: + + my_dumping_routine(struct sk_buff *skb, ...) + { + if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, + TCA_XSTATS, &mystruct->lock, &dump, + TCA_PAD) < 0) + goto rtattr_failure; + ... + } + +A struct tc_stats will be filled out during gnet_stats_copy_* calls +and appended to the skb. TCA_XSTATS is provided if gnet_stats_copy_app +was called. + + +Locking: +-------- + +Locks are taken before writing and released once all statistics have +been written. Locks are always released in case of an error. You +are responsible for making sure that the lock is initialized. + + +Rate Estimator: +--------------- + +0) Prepare an estimator attribute. Most likely this would be in user + space. The value of this TLV should contain a tc_estimator structure. + As usual, such a TLV needs to be 32 bit aligned and therefore the + length needs to be appropriately set, etc. The estimator interval + and ewma log need to be converted to the appropriate values. + tc_estimator.c::tc_setup_estimator() is advisable to be used as the + conversion routine. It does a few clever things. It takes a time + interval in microsecs, a time constant also in microsecs and a struct + tc_estimator to be populated. The returned tc_estimator can be + transported to the kernel. Transfer such a structure in a TLV of type + TCA_RATE to your code in the kernel. + +In the kernel when setting up: + +1) make sure you have basic stats and rate stats setup first. +2) make sure you have initialized stats lock that is used to setup such + stats. +3) Now initialize a new estimator:: + + int ret = gen_new_estimator(my_basicstats,my_rate_est_stats, + mystats_lock, attr_with_tcestimator_struct); + + if ret == 0 + success + else + failed + +From now on, every time you dump my_rate_est_stats it will contain +up-to-date info. + +Once you are done, call gen_kill_estimator(my_basicstats, +my_rate_est_stats) Make sure that my_basicstats and my_rate_est_stats +are still valid (i.e still exist) at the time of making this call. 
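For illustration, the numbered steps above can be combined into a single sketch. It follows the simplified call forms used in this document (the real gen_new_estimator()/gen_kill_estimator() signatures vary across kernel versions), and every name below is a placeholder::

    static int mystruct_start_estimator(struct mystruct *q,
                                        spinlock_t *stats_lock,
                                        struct nlattr *tca_rate_tlv)
    {
            int ret;

            /* Steps 1-2: q->bstats, a rate_est field and stats_lock are
             * assumed to exist and to be initialized already. */

            /* Step 3: attach the estimator described by the TCA_RATE TLV */
            ret = gen_new_estimator(&q->bstats, &q->rate_est,
                                    stats_lock, tca_rate_tlv);
            if (ret < 0)
                    return ret;

            /* q->rate_est is now refreshed automatically; tear it down with
             * gen_kill_estimator(&q->bstats, &q->rate_est) while both are
             * still valid. */
            return 0;
    }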
+ + +Authors: +-------- +- Thomas Graf +- Jamal Hadi Salim diff --git a/Documentation/networking/gen_stats.txt b/Documentation/networking/gen_stats.txt deleted file mode 100644 index 179b18ce45ff..000000000000 --- a/Documentation/networking/gen_stats.txt +++ /dev/null @@ -1,119 +0,0 @@ -Generic networking statistics for netlink users -====================================================================== - -Statistic counters are grouped into structs: - -Struct TLV type Description ----------------------------------------------------------------------- -gnet_stats_basic TCA_STATS_BASIC Basic statistics -gnet_stats_rate_est TCA_STATS_RATE_EST Rate estimator -gnet_stats_queue TCA_STATS_QUEUE Queue statistics -none TCA_STATS_APP Application specific - - -Collecting: ------------ - -Declare the statistic structs you need: -struct mystruct { - struct gnet_stats_basic bstats; - struct gnet_stats_queue qstats; - ... -}; - -Update statistics, in dequeue() methods only, (while owning qdisc->running) -mystruct->tstats.packet++; -mystruct->qstats.backlog += skb->pkt_len; - - -Export to userspace (Dump): ---------------------------- - -my_dumping_routine(struct sk_buff *skb, ...) -{ - struct gnet_dump dump; - - if (gnet_stats_start_copy(skb, TCA_STATS2, &mystruct->lock, &dump, - TCA_PAD) < 0) - goto rtattr_failure; - - if (gnet_stats_copy_basic(&dump, &mystruct->bstats) < 0 || - gnet_stats_copy_queue(&dump, &mystruct->qstats) < 0 || - gnet_stats_copy_app(&dump, &xstats, sizeof(xstats)) < 0) - goto rtattr_failure; - - if (gnet_stats_finish_copy(&dump) < 0) - goto rtattr_failure; - ... -} - -TCA_STATS/TCA_XSTATS backward compatibility: --------------------------------------------- - -Prior users of struct tc_stats and xstats can maintain backward -compatibility by calling the compat wrappers to keep providing the -existing TLV types. - -my_dumping_routine(struct sk_buff *skb, ...) -{ - if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, - TCA_XSTATS, &mystruct->lock, &dump, - TCA_PAD) < 0) - goto rtattr_failure; - ... -} - -A struct tc_stats will be filled out during gnet_stats_copy_* calls -and appended to the skb. TCA_XSTATS is provided if gnet_stats_copy_app -was called. - - -Locking: --------- - -Locks are taken before writing and released once all statistics have -been written. Locks are always released in case of an error. You -are responsible for making sure that the lock is initialized. - - -Rate Estimator: --------------- - -0) Prepare an estimator attribute. Most likely this would be in user - space. The value of this TLV should contain a tc_estimator structure. - As usual, such a TLV needs to be 32 bit aligned and therefore the - length needs to be appropriately set, etc. The estimator interval - and ewma log need to be converted to the appropriate values. - tc_estimator.c::tc_setup_estimator() is advisable to be used as the - conversion routine. It does a few clever things. It takes a time - interval in microsecs, a time constant also in microsecs and a struct - tc_estimator to be populated. The returned tc_estimator can be - transported to the kernel. Transfer such a structure in a TLV of type - TCA_RATE to your code in the kernel. - -In the kernel when setting up: -1) make sure you have basic stats and rate stats setup first. -2) make sure you have initialized stats lock that is used to setup such - stats. 
-3) Now initialize a new estimator: - - int ret = gen_new_estimator(my_basicstats,my_rate_est_stats, - mystats_lock, attr_with_tcestimator_struct); - - if ret == 0 - success - else - failed - -From now on, every time you dump my_rate_est_stats it will contain -up-to-date info. - -Once you are done, call gen_kill_estimator(my_basicstats, -my_rate_est_stats) Make sure that my_basicstats and my_rate_est_stats -are still valid (i.e still exist) at the time of making this call. - - -Authors: --------- -Thomas Graf -Jamal Hadi Salim diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 42e556509e22..33afbb67f3fa 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -61,6 +61,7 @@ Contents: framerelay generic-hdlc generic_netlink + gen_stats .. only:: subproject and html diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 1d653fbfcf52..e491b083b348 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -6,7 +6,7 @@ * Jamal Hadi Salim * Alexey Kuznetsov, * - * See Documentation/networking/gen_stats.txt + * See Documentation/networking/gen_stats.rst */ #include -- cgit v1.2.3 From 1cec2cacaaec5d53adc04dd3ecfdb687b26c0e89 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:49 +0200 Subject: docs: networking: convert ip-sysctl.txt to ReST - add SPDX header; - adjust titles and chapters, adding proper markups; - mark code blocks and literals as such; - mark lists as such; - mark tables as such; - use footnote markup; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/admin-guide/sysctl/net.rst | 2 +- Documentation/networking/index.rst | 1 + Documentation/networking/ip-sysctl.rst | 2649 +++++++++++++++++++++++ Documentation/networking/ip-sysctl.txt | 2374 -------------------- Documentation/networking/snmp_counter.rst | 2 +- net/Kconfig | 2 +- net/ipv4/Kconfig | 2 +- net/ipv4/icmp.c | 2 +- 9 files changed, 2656 insertions(+), 2380 deletions(-) create mode 100644 Documentation/networking/ip-sysctl.rst delete mode 100644 Documentation/networking/ip-sysctl.txt (limited to 'net') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index b23ab11587a6..e37db6f1be64 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4910,7 +4910,7 @@ Set the number of tcp_metrics_hash slots. Default value is 8192 or 16384 depending on total ram pages. This is used to specify the TCP metrics - cache size. See Documentation/networking/ip-sysctl.txt + cache size. See Documentation/networking/ip-sysctl.rst "tcp_no_metrics_save" section for more details. tdfx= [HW,DRM] diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index e043c9213388..84e3348a9543 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -353,7 +353,7 @@ socket's buffer. It will not take effect unless PF_UNIX flag is specified. 3. /proc/sys/net/ipv4 - IPV4 settings ------------------------------------- -Please see: Documentation/networking/ip-sysctl.txt and ipvs-sysctl.txt for +Please see: Documentation/networking/ip-sysctl.rst and ipvs-sysctl.txt for descriptions of these entries. 
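One practical note before the converted document itself: every variable described in ip-sysctl.rst is exposed as a file under /proc/sys/net/ipv4, so it can be inspected (or, with sufficient privilege, changed) with ordinary file I/O. A minimal user-space sketch; the entry chosen, ip_forward, is simply the first one documented below::

    #include <stdio.h>

    int main(void)
    {
            char buf[64];
            FILE *f = fopen("/proc/sys/net/ipv4/ip_forward", "r");

            if (!f)
                    return 1;
            if (fgets(buf, sizeof(buf), f))
                    printf("ip_forward = %s", buf); /* value includes '\n' */
            fclose(f);
            return 0;
    }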
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 3efb4608649a..7d133d8dbe2a 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -69,6 +69,7 @@ Contents: ip_dynaddr iphase ipsec + ip-sysctl .. only:: subproject and html diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst new file mode 100644 index 000000000000..38f811d4b2f0 --- /dev/null +++ b/Documentation/networking/ip-sysctl.rst @@ -0,0 +1,2649 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========= +IP Sysctl +========= + +/proc/sys/net/ipv4/* Variables +============================== + +ip_forward - BOOLEAN + - 0 - disabled (default) + - not 0 - enabled + + Forward Packets between interfaces. + + This variable is special, its change resets all configuration + parameters to their default state (RFC1122 for hosts, RFC1812 + for routers) + +ip_default_ttl - INTEGER + Default value of TTL field (Time To Live) for outgoing (but not + forwarded) IP packets. Should be between 1 and 255 inclusive. + Default: 64 (as recommended by RFC1700) + +ip_no_pmtu_disc - INTEGER + Disable Path MTU Discovery. If enabled in mode 1 and a + fragmentation-required ICMP is received, the PMTU to this + destination will be set to min_pmtu (see below). You will need + to raise min_pmtu to the smallest interface MTU on your system + manually if you want to avoid locally generated fragments. + + In mode 2 incoming Path MTU Discovery messages will be + discarded. Outgoing frames are handled the same as in mode 1, + implicitly setting IP_PMTUDISC_DONT on every created socket. + + Mode 3 is a hardened pmtu discover mode. The kernel will only + accept fragmentation-needed errors if the underlying protocol + can verify them besides a plain socket lookup. Current + protocols for which pmtu events will be honored are TCP, SCTP + and DCCP as they verify e.g. the sequence number or the + association. This mode should not be enabled globally but is + only intended to secure e.g. name servers in namespaces where + TCP path mtu must still work but path MTU information of other + protocols should be discarded. If enabled globally this mode + could break other protocols. + + Possible values: 0-3 + + Default: FALSE + +min_pmtu - INTEGER + default 552 - minimum discovered Path MTU + +ip_forward_use_pmtu - BOOLEAN + By default we don't trust protocol path MTUs while forwarding + because they could be easily forged and can lead to unwanted + fragmentation by the router. + You only need to enable this if you have user-space software + which tries to discover path mtus by itself and depends on the + kernel honoring this information. This is normally not the + case. + + Default: 0 (disabled) + + Possible values: + + - 0 - disabled + - 1 - enabled + +fwmark_reflect - BOOLEAN + Controls the fwmark of kernel-generated IPv4 reply packets that are not + associated with a socket for example, TCP RSTs or ICMP echo replies). + If unset, these packets have a fwmark of zero. If set, they have the + fwmark of the packet they are replying to. + + Default: 0 + +fib_multipath_use_neigh - BOOLEAN + Use status of existing neighbor entry when determining nexthop for + multipath routes. If disabled, neighbor information is not used and + packets could be directed to a failed nexthop. Only valid for kernels + built with CONFIG_IP_ROUTE_MULTIPATH enabled. 
+ + Default: 0 (disabled) + + Possible values: + + - 0 - disabled + - 1 - enabled + +fib_multipath_hash_policy - INTEGER + Controls which hash policy to use for multipath routes. Only valid + for kernels built with CONFIG_IP_ROUTE_MULTIPATH enabled. + + Default: 0 (Layer 3) + + Possible values: + + - 0 - Layer 3 + - 1 - Layer 4 + - 2 - Layer 3 or inner Layer 3 if present + +fib_sync_mem - UNSIGNED INTEGER + Amount of dirty memory from fib entries that can be backlogged before + synchronize_rcu is forced. + + Default: 512kB Minimum: 64kB Maximum: 64MB + +ip_forward_update_priority - INTEGER + Whether to update SKB priority from "TOS" field in IPv4 header after it + is forwarded. The new SKB priority is mapped from TOS field value + according to an rt_tos2priority table (see e.g. man tc-prio). + + Default: 1 (Update priority.) + + Possible values: + + - 0 - Do not update priority. + - 1 - Update priority. + +route/max_size - INTEGER + Maximum number of routes allowed in the kernel. Increase + this when using large numbers of interfaces and/or routes. + + From linux kernel 3.6 onwards, this is deprecated for ipv4 + as route cache is no longer used. + +neigh/default/gc_thresh1 - INTEGER + Minimum number of entries to keep. Garbage collector will not + purge entries if there are fewer than this number. + + Default: 128 + +neigh/default/gc_thresh2 - INTEGER + Threshold when garbage collector becomes more aggressive about + purging entries. Entries older than 5 seconds will be cleared + when over this number. + + Default: 512 + +neigh/default/gc_thresh3 - INTEGER + Maximum number of non-PERMANENT neighbor entries allowed. Increase + this when using large numbers of interfaces and when communicating + with large numbers of directly-connected peers. + + Default: 1024 + +neigh/default/unres_qlen_bytes - INTEGER + The maximum number of bytes which may be used by packets + queued for each unresolved address by other network layers. + (added in linux 3.3) + + Setting negative value is meaningless and will return error. + + Default: SK_WMEM_MAX, (same as net.core.wmem_default). + + Exact value depends on architecture and kernel options, + but should be enough to allow queuing 256 packets + of medium size. + +neigh/default/unres_qlen - INTEGER + The maximum number of packets which may be queued for each + unresolved address by other network layers. + + (deprecated in linux 3.3) : use unres_qlen_bytes instead. + + Prior to linux 3.3, the default value is 3 which may cause + unexpected packet loss. The current default value is calculated + according to default value of unres_qlen_bytes and true size of + packet. + + Default: 101 + +mtu_expires - INTEGER + Time, in seconds, that cached PMTU information is kept. + +min_adv_mss - INTEGER + The advertised MSS depends on the first hop route MTU, but will + never be lower than this setting. + +IP Fragmentation: + +ipfrag_high_thresh - LONG INTEGER + Maximum memory used to reassemble IP fragments. + +ipfrag_low_thresh - LONG INTEGER + (Obsolete since linux-4.17) + Maximum memory used to reassemble IP fragments before the kernel + begins to remove incomplete fragment queues to free up resources. + The kernel still accepts new fragments for defragmentation. + +ipfrag_time - INTEGER + Time in seconds to keep an IP fragment in memory. + +ipfrag_max_dist - INTEGER + ipfrag_max_dist is a non-negative integer value which defines the + maximum "disorder" which is allowed among fragments which share a + common IP source address. 
Note that reordering of packets is + not unusual, but if a large number of fragments arrive from a source + IP address while a particular fragment queue remains incomplete, it + probably indicates that one or more fragments belonging to that queue + have been lost. When ipfrag_max_dist is positive, an additional check + is done on fragments before they are added to a reassembly queue - if + ipfrag_max_dist (or more) fragments have arrived from a particular IP + address between additions to any IP fragment queue using that source + address, it's presumed that one or more fragments in the queue are + lost. The existing fragment queue will be dropped, and a new one + started. An ipfrag_max_dist value of zero disables this check. + + Using a very small value, e.g. 1 or 2, for ipfrag_max_dist can + result in unnecessarily dropping fragment queues when normal + reordering of packets occurs, which could lead to poor application + performance. Using a very large value, e.g. 50000, increases the + likelihood of incorrectly reassembling IP fragments that originate + from different IP datagrams, which could result in data corruption. + Default: 64 + +INET peer storage +================= + +inet_peer_threshold - INTEGER + The approximate size of the storage. Starting from this threshold + entries will be thrown aggressively. This threshold also determines + entries' time-to-live and time intervals between garbage collection + passes. More entries, less time-to-live, less GC interval. + +inet_peer_minttl - INTEGER + Minimum time-to-live of entries. Should be enough to cover fragment + time-to-live on the reassembling side. This minimum time-to-live is + guaranteed if the pool size is less than inet_peer_threshold. + Measured in seconds. + +inet_peer_maxttl - INTEGER + Maximum time-to-live of entries. Unused entries will expire after + this period of time if there is no memory pressure on the pool (i.e. + when the number of entries in the pool is very small). + Measured in seconds. + +TCP variables +============= + +somaxconn - INTEGER + Limit of socket listen() backlog, known in userspace as SOMAXCONN. + Defaults to 4096. (Was 128 before linux-5.4) + See also tcp_max_syn_backlog for additional tuning for TCP sockets. + +tcp_abort_on_overflow - BOOLEAN + If listening service is too slow to accept new connections, + reset them. Default state is FALSE. It means that if overflow + occurred due to a burst, connection will recover. Enable this + option _only_ if you are really sure that listening daemon + cannot be tuned to accept connections faster. Enabling this + option can harm clients of your server. + +tcp_adv_win_scale - INTEGER + Count buffering overhead as bytes/2^tcp_adv_win_scale + (if tcp_adv_win_scale > 0) or bytes-bytes/2^(-tcp_adv_win_scale), + if it is <= 0. + + Possible values are [-31, 31], inclusive. + + Default: 1 + +tcp_allowed_congestion_control - STRING + Show/set the congestion control choices available to non-privileged + processes. The list is a subset of those listed in + tcp_available_congestion_control. + + Default is "reno" and the default setting (tcp_congestion_control). + +tcp_app_win - INTEGER + Reserve max(window/2^tcp_app_win, mss) of window for application + buffer. Value 0 is special, it means that nothing is reserved. + + Default: 31 + +tcp_autocorking - BOOLEAN + Enable TCP auto corking : + When applications do consecutive small write()/sendmsg() system calls, + we try to coalesce these small writes as much as possible, to lower + total amount of sent packets. 
This is done if at least one prior + packet for the flow is waiting in Qdisc queues or device transmit + queue. Applications can still use TCP_CORK for optimal behavior + when they know how/when to uncork their sockets. + + Default : 1 + +tcp_available_congestion_control - STRING + Shows the available congestion control choices that are registered. + More congestion control algorithms may be available as modules, + but not loaded. + +tcp_base_mss - INTEGER + The initial value of search_low to be used by the packetization layer + Path MTU discovery (MTU probing). If MTU probing is enabled, + this is the initial MSS used by the connection. + +tcp_mtu_probe_floor - INTEGER + If MTU probing is enabled this caps the minimum MSS used for search_low + for the connection. + + Default : 48 + +tcp_min_snd_mss - INTEGER + TCP SYN and SYNACK messages usually advertise an ADVMSS option, + as described in RFC 1122 and RFC 6691. + + If this ADVMSS option is smaller than tcp_min_snd_mss, + it is silently capped to tcp_min_snd_mss. + + Default : 48 (at least 8 bytes of payload per segment) + +tcp_congestion_control - STRING + Set the congestion control algorithm to be used for new + connections. The algorithm "reno" is always available, but + additional choices may be available based on kernel configuration. + Default is set as part of kernel configuration. + For passive connections, the listener congestion control choice + is inherited. + + [see setsockopt(listenfd, SOL_TCP, TCP_CONGESTION, "name" ...) ] + +tcp_dsack - BOOLEAN + Allows TCP to send "duplicate" SACKs. + +tcp_early_retrans - INTEGER + Tail loss probe (TLP) converts RTOs occurring due to tail + losses into fast recovery (draft-ietf-tcpm-rack). Note that + TLP requires RACK to function properly (see tcp_recovery below) + + Possible values: + + - 0 disables TLP + - 3 or 4 enables TLP + + Default: 3 + +tcp_ecn - INTEGER + Control use of Explicit Congestion Notification (ECN) by TCP. + ECN is used only when both ends of the TCP connection indicate + support for it. This feature is useful in avoiding losses due + to congestion by allowing supporting routers to signal + congestion before having to drop packets. + + Possible values are: + + = ===================================================== + 0 Disable ECN. Neither initiate nor accept ECN. + 1 Enable ECN when requested by incoming connections and + also request ECN on outgoing connection attempts. + 2 Enable ECN when requested by incoming connections + but do not request ECN on outgoing connections. + = ===================================================== + + Default: 2 + +tcp_ecn_fallback - BOOLEAN + If the kernel detects that ECN connection misbehaves, enable fall + back to non-ECN. Currently, this knob implements the fallback + from RFC3168, section 6.1.1.1., but we reserve that in future, + additional detection mechanisms could be implemented under this + knob. The value is not used, if tcp_ecn or per route (or congestion + control) ECN settings are disabled. + + Default: 1 (fallback enabled) + +tcp_fack - BOOLEAN + This is a legacy option, it has no effect anymore. + +tcp_fin_timeout - INTEGER + The length of time an orphaned (no longer referenced by any + application) connection will remain in the FIN_WAIT_2 state + before it is aborted at the local end. While a perfectly + valid "receive only" state for an un-orphaned connection, an + orphaned connection in FIN_WAIT_2 state could otherwise wait + forever for the remote to close its end of the connection. + + Cf. 
tcp_max_orphans + + Default: 60 seconds + +tcp_frto - INTEGER + Enables Forward RTO-Recovery (F-RTO) defined in RFC5682. + F-RTO is an enhanced recovery algorithm for TCP retransmission + timeouts. It is particularly beneficial in networks where the + RTT fluctuates (e.g., wireless). F-RTO is sender-side only + modification. It does not require any support from the peer. + + By default it's enabled with a non-zero value. 0 disables F-RTO. + +tcp_fwmark_accept - BOOLEAN + If set, incoming connections to listening sockets that do not have a + socket mark will set the mark of the accepting socket to the fwmark of + the incoming SYN packet. This will cause all packets on that connection + (starting from the first SYNACK) to be sent with that fwmark. The + listening socket's mark is unchanged. Listening sockets that already + have a fwmark set via setsockopt(SOL_SOCKET, SO_MARK, ...) are + unaffected. + + Default: 0 + +tcp_invalid_ratelimit - INTEGER + Limit the maximal rate for sending duplicate acknowledgments + in response to incoming TCP packets that are for an existing + connection but that are invalid due to any of these reasons: + + (a) out-of-window sequence number, + (b) out-of-window acknowledgment number, or + (c) PAWS (Protection Against Wrapped Sequence numbers) check failure + + This can help mitigate simple "ack loop" DoS attacks, wherein + a buggy or malicious middlebox or man-in-the-middle can + rewrite TCP header fields in manner that causes each endpoint + to think that the other is sending invalid TCP segments, thus + causing each side to send an unterminating stream of duplicate + acknowledgments for invalid segments. + + Using 0 disables rate-limiting of dupacks in response to + invalid segments; otherwise this value specifies the minimal + space between sending such dupacks, in milliseconds. + + Default: 500 (milliseconds). + +tcp_keepalive_time - INTEGER + How often TCP sends out keepalive messages when keepalive is enabled. + Default: 2hours. + +tcp_keepalive_probes - INTEGER + How many keepalive probes TCP sends out, until it decides that the + connection is broken. Default value: 9. + +tcp_keepalive_intvl - INTEGER + How frequently the probes are send out. Multiplied by + tcp_keepalive_probes it is time to kill not responding connection, + after probes started. Default value: 75sec i.e. connection + will be aborted after ~11 minutes of retries. + +tcp_l3mdev_accept - BOOLEAN + Enables child sockets to inherit the L3 master device index. + Enabling this option allows a "global" listen socket to work + across L3 master domains (e.g., VRFs) with connected sockets + derived from the listen socket to be bound to the L3 domain in + which the packets originated. Only valid when the kernel was + compiled with CONFIG_NET_L3_MASTER_DEV. + + Default: 0 (disabled) + +tcp_low_latency - BOOLEAN + This is a legacy option, it has no effect anymore. + +tcp_max_orphans - INTEGER + Maximal number of TCP sockets not attached to any user file handle, + held by system. If this number is exceeded orphaned connections are + reset immediately and warning is printed. This limit exists + only to prevent simple DoS attacks, you _must_ not rely on this + or lower the limit artificially, but rather increase it + (probably, after increasing installed memory), + if network conditions require more than default value, + and tune network services to linger and kill such states + more aggressively. Let me to remind again: each orphan eats + up to ~64K of unswappable memory. 
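To make the tcp_keepalive_* entries above concrete: a process can enable keepalive per socket and override the three global defaults through socket options. A hedged user-space sketch; the numeric values are illustrative and error handling for the overrides is elided::

    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <sys/socket.h>

    static int enable_keepalive(int fd)
    {
            int on = 1, idle = 600, intvl = 60, cnt = 5;

            if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)) < 0)
                    return -1;
            /* Per-socket overrides of tcp_keepalive_time,
             * tcp_keepalive_intvl and tcp_keepalive_probes: */
            setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
            setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
            setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
            return 0;
    }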
+ +tcp_max_syn_backlog - INTEGER + Maximal number of remembered connection requests (SYN_RECV), + which have not received an acknowledgment from connecting client. + + This is a per-listener limit. + + The minimal value is 128 for low memory machines, and it will + increase in proportion to the memory of machine. + + If server suffers from overload, try increasing this number. + + Remember to also check /proc/sys/net/core/somaxconn + A SYN_RECV request socket consumes about 304 bytes of memory. + +tcp_max_tw_buckets - INTEGER + Maximal number of timewait sockets held by system simultaneously. + If this number is exceeded time-wait socket is immediately destroyed + and warning is printed. This limit exists only to prevent + simple DoS attacks, you _must_ not lower the limit artificially, + but rather increase it (probably, after increasing installed memory), + if network conditions require more than default value. + +tcp_mem - vector of 3 INTEGERs: min, pressure, max + min: below this number of pages TCP is not bothered about its + memory appetite. + + pressure: when amount of memory allocated by TCP exceeds this number + of pages, TCP moderates its memory consumption and enters memory + pressure mode, which is exited when memory consumption falls + under "min". + + max: number of pages allowed for queueing by all TCP sockets. + + Defaults are calculated at boot time from amount of available + memory. + +tcp_min_rtt_wlen - INTEGER + The window length of the windowed min filter to track the minimum RTT. + A shorter window lets a flow more quickly pick up new (higher) + minimum RTT when it is moved to a longer path (e.g., due to traffic + engineering). A longer window makes the filter more resistant to RTT + inflations such as transient congestion. The unit is seconds. + + Possible values: 0 - 86400 (1 day) + + Default: 300 + +tcp_moderate_rcvbuf - BOOLEAN + If set, TCP performs receive buffer auto-tuning, attempting to + automatically size the buffer (no greater than tcp_rmem[2]) to + match the size required by the path for full throughput. Enabled by + default. + +tcp_mtu_probing - INTEGER + Controls TCP Packetization-Layer Path MTU Discovery. Takes three + values: + + - 0 - Disabled + - 1 - Disabled by default, enabled when an ICMP black hole detected + - 2 - Always enabled, use initial MSS of tcp_base_mss. + +tcp_probe_interval - UNSIGNED INTEGER + Controls how often to start TCP Packetization-Layer Path MTU + Discovery reprobe. The default is reprobing every 10 minutes as + per RFC4821. + +tcp_probe_threshold - INTEGER + Controls when TCP Packetization-Layer Path MTU Discovery probing + will stop in respect to the width of search range in bytes. Default + is 8 bytes. + +tcp_no_metrics_save - BOOLEAN + By default, TCP saves various connection metrics in the route cache + when the connection closes, so that connections established in the + near future can use these to set initial conditions. Usually, this + increases overall performance, but may sometimes cause performance + degradation. If set, TCP will not cache metrics on closing + connections. + +tcp_no_ssthresh_metrics_save - BOOLEAN + Controls whether TCP saves ssthresh metrics in the route cache. + + Default is 1, which disables ssthresh metrics. + +tcp_orphan_retries - INTEGER + This value influences the timeout of a locally closed TCP connection, + when RTO retransmissions remain unacknowledged. + See tcp_retries2 for more details. + + The default value is 8. 
+ + If your machine is a loaded WEB server, + you should think about lowering this value, such sockets + may consume significant resources. Cf. tcp_max_orphans. + +tcp_recovery - INTEGER + This value is a bitmap to enable various experimental loss recovery + features. + + ========= ============================================================= + RACK: 0x1 enables the RACK loss detection for fast detection of lost + retransmissions and tail drops. It also subsumes and disables + RFC6675 recovery for SACK connections. + + RACK: 0x2 makes RACK's reordering window static (min_rtt/4). + + RACK: 0x4 disables RACK's DUPACK threshold heuristic + ========= ============================================================= + + Default: 0x1 + +tcp_reordering - INTEGER + Initial reordering level of packets in a TCP stream. + TCP stack can then dynamically adjust flow reordering level + between this initial value and tcp_max_reordering + + Default: 3 + +tcp_max_reordering - INTEGER + Maximal reordering level of packets in a TCP stream. + 300 is a fairly conservative value, but you might increase it + if paths are using per packet load balancing (like bonding rr mode) + + Default: 300 + +tcp_retrans_collapse - BOOLEAN + Bug-to-bug compatibility with some broken printers. + On retransmit try to send bigger packets to work around bugs in + certain TCP stacks. + +tcp_retries1 - INTEGER + This value influences the time, after which TCP decides, that + something is wrong due to unacknowledged RTO retransmissions, + and reports this suspicion to the network layer. + See tcp_retries2 for more details. + + RFC 1122 recommends at least 3 retransmissions, which is the + default. + +tcp_retries2 - INTEGER + This value influences the timeout of an alive TCP connection, + when RTO retransmissions remain unacknowledged. + Given a value of N, a hypothetical TCP connection following + exponential backoff with an initial RTO of TCP_RTO_MIN would + retransmit N times before killing the connection at the (N+1)th RTO. + + The default value of 15 yields a hypothetical timeout of 924.6 + seconds and is a lower bound for the effective timeout. + TCP will effectively time out at the first RTO which exceeds the + hypothetical timeout. + + RFC 1122 recommends at least 100 seconds for the timeout, + which corresponds to a value of at least 8. + +tcp_rfc1337 - BOOLEAN + If set, the TCP stack behaves conforming to RFC1337. If unset, + we are not conforming to RFC, but prevent TCP TIME_WAIT + assassination. + + Default: 0 + +tcp_rmem - vector of 3 INTEGERs: min, default, max + min: Minimal size of receive buffer used by TCP sockets. + It is guaranteed to each TCP socket, even under moderate memory + pressure. + + Default: 4K + + default: initial size of receive buffer used by TCP sockets. + This value overrides net.core.rmem_default used by other protocols. + Default: 87380 bytes. This value results in window of 65535 with + default setting of tcp_adv_win_scale and tcp_app_win:0 and a bit + less for default tcp_app_win. See below about these variables. + + max: maximal size of receive buffer allowed for automatically + selected receiver buffers for TCP socket. This value does not override + net.core.rmem_max. Calling setsockopt() with SO_RCVBUF disables + automatic tuning of that socket's receive buffer size, in which + case this value is ignored. + Default: between 87380B and 6MB, depending on RAM size. + +tcp_sack - BOOLEAN + Enable select acknowledgments (SACKS). 
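A minimal sketch illustrating the tcp_rmem note above: setting SO_RCVBUF explicitly pins the buffer and disables receive-buffer autotuning for that socket. The helper name is hypothetical and the size is an arbitrary example::

    #include <sys/socket.h>

    /* After this call the kernel stops autotuning fd's receive buffer;
     * it also doubles the requested value internally to leave room for
     * bookkeeping overhead (see socket(7)). */
    static int pin_rcvbuf(int fd, int bytes)
    {
            return setsockopt(fd, SOL_SOCKET, SO_RCVBUF,
                              &bytes, sizeof(bytes));
    }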
+ +tcp_comp_sack_delay_ns - LONG INTEGER + TCP tries to reduce number of SACK sent, using a timer + based on 5% of SRTT, capped by this sysctl, in nano seconds. + The default is 1ms, based on TSO autosizing period. + + Default : 1,000,000 ns (1 ms) + +tcp_comp_sack_nr - INTEGER + Max number of SACK that can be compressed. + Using 0 disables SACK compression. + + Default : 44 + +tcp_slow_start_after_idle - BOOLEAN + If set, provide RFC2861 behavior and time out the congestion + window after an idle period. An idle period is defined at + the current RTO. If unset, the congestion window will not + be timed out after an idle period. + + Default: 1 + +tcp_stdurg - BOOLEAN + Use the Host requirements interpretation of the TCP urgent pointer field. + Most hosts use the older BSD interpretation, so if you turn this on + Linux might not communicate correctly with them. + + Default: FALSE + +tcp_synack_retries - INTEGER + Number of times SYNACKs for a passive TCP connection attempt will + be retransmitted. Should not be higher than 255. Default value + is 5, which corresponds to 31seconds till the last retransmission + with the current initial RTO of 1second. With this the final timeout + for a passive TCP connection will happen after 63seconds. + +tcp_syncookies - INTEGER + Only valid when the kernel was compiled with CONFIG_SYN_COOKIES + Send out syncookies when the syn backlog queue of a socket + overflows. This is to prevent against the common 'SYN flood attack' + Default: 1 + + Note, that syncookies is fallback facility. + It MUST NOT be used to help highly loaded servers to stand + against legal connection rate. If you see SYN flood warnings + in your logs, but investigation shows that they occur + because of overload with legal connections, you should tune + another parameters until this warning disappear. + See: tcp_max_syn_backlog, tcp_synack_retries, tcp_abort_on_overflow. + + syncookies seriously violate TCP protocol, do not allow + to use TCP extensions, can result in serious degradation + of some services (f.e. SMTP relaying), visible not by you, + but your clients and relays, contacting you. While you see + SYN flood warnings in logs not being really flooded, your server + is seriously misconfigured. + + If you want to test which effects syncookies have to your + network connections you can set this knob to 2 to enable + unconditionally generation of syncookies. + +tcp_fastopen - INTEGER + Enable TCP Fast Open (RFC7413) to send and accept data in the opening + SYN packet. + + The client support is enabled by flag 0x1 (on by default). The client + then must use sendmsg() or sendto() with the MSG_FASTOPEN flag, + rather than connect() to send data in SYN. + + The server support is enabled by flag 0x2 (off by default). Then + either enable for all listeners with another flag (0x400) or + enable individual listeners via TCP_FASTOPEN socket option with + the option value being the length of the syn-data backlog. + + The values (bitmap) are + + ===== ======== ====================================================== + 0x1 (client) enables sending data in the opening SYN on the client. + 0x2 (server) enables the server support, i.e., allowing data in + a SYN packet to be accepted and passed to the + application before 3-way handshake finishes. + 0x4 (client) send data in the opening SYN regardless of cookie + availability and without a cookie option. + 0x200 (server) accept data-in-SYN w/o any cookie option present. 
+ 0x400 (server) enable all listeners to support Fast Open by + default without explicit TCP_FASTOPEN socket option. + ===== ======== ====================================================== + + Default: 0x1 + + Note that that additional client or server features are only + effective if the basic support (0x1 and 0x2) are enabled respectively. + +tcp_fastopen_blackhole_timeout_sec - INTEGER + Initial time period in second to disable Fastopen on active TCP sockets + when a TFO firewall blackhole issue happens. + This time period will grow exponentially when more blackhole issues + get detected right after Fastopen is re-enabled and will reset to + initial value when the blackhole issue goes away. + 0 to disable the blackhole detection. + + By default, it is set to 1hr. + +tcp_fastopen_key - list of comma separated 32-digit hexadecimal INTEGERs + The list consists of a primary key and an optional backup key. The + primary key is used for both creating and validating cookies, while the + optional backup key is only used for validating cookies. The purpose of + the backup key is to maximize TFO validation when keys are rotated. + + A randomly chosen primary key may be configured by the kernel if + the tcp_fastopen sysctl is set to 0x400 (see above), or if the + TCP_FASTOPEN setsockopt() optname is set and a key has not been + previously configured via sysctl. If keys are configured via + setsockopt() by using the TCP_FASTOPEN_KEY optname, then those + per-socket keys will be used instead of any keys that are specified via + sysctl. + + A key is specified as 4 8-digit hexadecimal integers which are separated + by a '-' as: xxxxxxxx-xxxxxxxx-xxxxxxxx-xxxxxxxx. Leading zeros may be + omitted. A primary and a backup key may be specified by separating them + by a comma. If only one key is specified, it becomes the primary key and + any previously configured backup keys are removed. + +tcp_syn_retries - INTEGER + Number of times initial SYNs for an active TCP connection attempt + will be retransmitted. Should not be higher than 127. Default value + is 6, which corresponds to 63seconds till the last retransmission + with the current initial RTO of 1second. With this the final timeout + for an active TCP connection attempt will happen after 127seconds. + +tcp_timestamps - INTEGER + Enable timestamps as defined in RFC1323. + + - 0: Disabled. + - 1: Enable timestamps as defined in RFC1323 and use random offset for + each connection rather than only using the current time. + - 2: Like 1, but without random offsets. + + Default: 1 + +tcp_min_tso_segs - INTEGER + Minimal number of segments per TSO frame. + + Since linux-3.12, TCP does an automatic sizing of TSO frames, + depending on flow rate, instead of filling 64Kbytes packets. + For specific usages, it's possible to force TCP to build big + TSO frames. Note that TCP stack might split too big TSO packets + if available window is too small. + + Default: 2 + +tcp_pacing_ss_ratio - INTEGER + sk->sk_pacing_rate is set by TCP stack using a ratio applied + to current rate. (current_rate = cwnd * mss / srtt) + If TCP is in slow start, tcp_pacing_ss_ratio is applied + to let TCP probe for bigger speeds, assuming cwnd can be + doubled every other RTT. + + Default: 200 + +tcp_pacing_ca_ratio - INTEGER + sk->sk_pacing_rate is set by TCP stack using a ratio applied + to current rate. (current_rate = cwnd * mss / srtt) + If TCP is in congestion avoidance phase, tcp_pacing_ca_ratio + is applied to conservatively probe for bigger throughput. 
+ + Default: 120 + +tcp_tso_win_divisor - INTEGER + This allows control over what percentage of the congestion window + can be consumed by a single TSO frame. + The setting of this parameter is a choice between burstiness and + building larger TSO frames. + + Default: 3 + +tcp_tw_reuse - INTEGER + Enable reuse of TIME-WAIT sockets for new connections when it is + safe from protocol viewpoint. + + - 0 - disable + - 1 - global enable + - 2 - enable for loopback traffic only + + It should not be changed without advice/request of technical + experts. + + Default: 2 + +tcp_window_scaling - BOOLEAN + Enable window scaling as defined in RFC1323. + +tcp_wmem - vector of 3 INTEGERs: min, default, max + min: Amount of memory reserved for send buffers for TCP sockets. + Each TCP socket has rights to use it due to fact of its birth. + + Default: 4K + + default: initial size of send buffer used by TCP sockets. This + value overrides net.core.wmem_default used by other protocols. + + It is usually lower than net.core.wmem_default. + + Default: 16K + + max: Maximal amount of memory allowed for automatically tuned + send buffers for TCP sockets. This value does not override + net.core.wmem_max. Calling setsockopt() with SO_SNDBUF disables + automatic tuning of that socket's send buffer size, in which case + this value is ignored. + + Default: between 64K and 4MB, depending on RAM size. + +tcp_notsent_lowat - UNSIGNED INTEGER + A TCP socket can control the amount of unsent bytes in its write queue, + thanks to TCP_NOTSENT_LOWAT socket option. poll()/select()/epoll() + reports POLLOUT events if the amount of unsent bytes is below a per + socket value, and if the write queue is not full. sendmsg() will + also not add new buffers if the limit is hit. + + This global variable controls the amount of unsent data for + sockets not using TCP_NOTSENT_LOWAT. For these sockets, a change + to the global variable has immediate effect. + + Default: UINT_MAX (0xFFFFFFFF) + +tcp_workaround_signed_windows - BOOLEAN + If set, assume no receipt of a window scaling option means the + remote TCP is broken and treats the window as a signed quantity. + If unset, assume the remote TCP is not broken even if we do + not receive a window scaling option from them. + + Default: 0 + +tcp_thin_linear_timeouts - BOOLEAN + Enable dynamic triggering of linear timeouts for thin streams. + If set, a check is performed upon retransmission by timeout to + determine if the stream is thin (less than 4 packets in flight). + As long as the stream is found to be thin, up to 6 linear + timeouts may be performed before exponential backoff mode is + initiated. This improves retransmission latency for + non-aggressive thin streams, often found to be time-dependent. + For more information on thin streams, see + Documentation/networking/tcp-thin.txt + + Default: 0 + +tcp_limit_output_bytes - INTEGER + Controls TCP Small Queue limit per tcp socket. + TCP bulk sender tends to increase packets in flight until it + gets losses notifications. With SNDBUF autotuning, this can + result in a large amount of packets queued on the local machine + (e.g.: qdiscs, CPU backlog, or device) hurting latency of other + flows, for typical pfifo_fast qdiscs. tcp_limit_output_bytes + limits the number of bytes on qdisc or device to reduce artificial + RTT/cwnd and reduce bufferbloat. 
+ + Default: 1048576 (16 * 65536) + +tcp_challenge_ack_limit - INTEGER + Limits number of Challenge ACK sent per second, as recommended + in RFC 5961 (Improving TCP's Robustness to Blind In-Window Attacks) + Default: 1000 + +tcp_rx_skb_cache - BOOLEAN + Controls a per TCP socket cache of one skb, that might help + performance of some workloads. This might be dangerous + on systems with a lot of TCP sockets, since it increases + memory usage. + + Default: 0 (disabled) + +UDP variables +============= + +udp_l3mdev_accept - BOOLEAN + Enabling this option allows a "global" bound socket to work + across L3 master domains (e.g., VRFs) with packets capable of + being received regardless of the L3 domain in which they + originated. Only valid when the kernel was compiled with + CONFIG_NET_L3_MASTER_DEV. + + Default: 0 (disabled) + +udp_mem - vector of 3 INTEGERs: min, pressure, max + Number of pages allowed for queueing by all UDP sockets. + + min: Below this number of pages UDP is not bothered about its + memory appetite. When amount of memory allocated by UDP exceeds + this number, UDP starts to moderate memory usage. + + pressure: This value was introduced to follow format of tcp_mem. + + max: Number of pages allowed for queueing by all UDP sockets. + + Default is calculated at boot time from amount of available memory. + +udp_rmem_min - INTEGER + Minimal size of receive buffer used by UDP sockets in moderation. + Each UDP socket is able to use the size for receiving data, even if + total pages of UDP sockets exceed udp_mem pressure. The unit is byte. + + Default: 4K + +udp_wmem_min - INTEGER + Minimal size of send buffer used by UDP sockets in moderation. + Each UDP socket is able to use the size for sending data, even if + total pages of UDP sockets exceed udp_mem pressure. The unit is byte. + + Default: 4K + +RAW variables +============= + +raw_l3mdev_accept - BOOLEAN + Enabling this option allows a "global" bound socket to work + across L3 master domains (e.g., VRFs) with packets capable of + being received regardless of the L3 domain in which they + originated. Only valid when the kernel was compiled with + CONFIG_NET_L3_MASTER_DEV. + + Default: 1 (enabled) + +CIPSOv4 Variables +================= + +cipso_cache_enable - BOOLEAN + If set, enable additions to and lookups from the CIPSO label mapping + cache. If unset, additions are ignored and lookups always result in a + miss. However, regardless of the setting the cache is still + invalidated when required when means you can safely toggle this on and + off and the cache will always be "safe". + + Default: 1 + +cipso_cache_bucket_size - INTEGER + The CIPSO label cache consists of a fixed size hash table with each + hash bucket containing a number of cache entries. This variable limits + the number of entries in each hash bucket; the larger the value the + more CIPSO label mappings that can be cached. When the number of + entries in a given hash bucket reaches this limit adding new entries + causes the oldest entry in the bucket to be removed to make room. + + Default: 10 + +cipso_rbm_optfmt - BOOLEAN + Enable the "Optimized Tag 1 Format" as defined in section 3.4.2.6 of + the CIPSO draft specification (see Documentation/netlabel for details). + This means that when set the CIPSO tag will be padded with empty + categories in order to make the packet data 32-bit aligned. + + Default: 0 + +cipso_rbm_structvalid - BOOLEAN + If set, do a very strict check of the CIPSO option when + ip_options_compile() is called. 
+
+CIPSOv4 Variables
+=================
+
+cipso_cache_enable - BOOLEAN
+    If set, enable additions to and lookups from the CIPSO label mapping
+    cache. If unset, additions are ignored and lookups always result in a
+    miss. However, regardless of the setting the cache is still
+    invalidated when required, which means you can safely toggle this on
+    and off and the cache will always be "safe".
+
+    Default: 1
+
+cipso_cache_bucket_size - INTEGER
+    The CIPSO label cache consists of a fixed size hash table with each
+    hash bucket containing a number of cache entries. This variable limits
+    the number of entries in each hash bucket; the larger the value the
+    more CIPSO label mappings that can be cached. When the number of
+    entries in a given hash bucket reaches this limit, adding new entries
+    causes the oldest entry in the bucket to be removed to make room.
+
+    Default: 10
+
+cipso_rbm_optfmt - BOOLEAN
+    Enable the "Optimized Tag 1 Format" as defined in section 3.4.2.6 of
+    the CIPSO draft specification (see Documentation/netlabel for details).
+    This means that when set the CIPSO tag will be padded with empty
+    categories in order to make the packet data 32-bit aligned.
+
+    Default: 0
+
+cipso_rbm_structvalid - BOOLEAN
+    If set, do a very strict check of the CIPSO option when
+    ip_options_compile() is called. If unset, relax the checks done during
+    ip_options_compile(). Either way is "safe" as errors are caught
+    elsewhere in the CIPSO processing code, but setting this to 0 (False)
+    should result in less work (i.e. it should be faster) but could cause
+    problems with other implementations that require strict checking.
+
+    Default: 0
+
+IP Variables
+============
+
+ip_local_port_range - 2 INTEGERS
+    Defines the local port range that is used by TCP and UDP to
+    choose the local port. The first number is the first local port
+    number, the second the last.
+    If possible, it is better if these numbers have different parity
+    (one even and one odd value).
+    Must be greater than or equal to ip_unprivileged_port_start.
+    The default values are 32768 and 60999 respectively.
+
+ip_local_reserved_ports - list of comma separated ranges
+    Specify the ports which are reserved for known third-party
+    applications. These ports will not be used by automatic port
+    assignments (e.g. when calling connect() or bind() with port
+    number 0). Explicit port allocation behavior is unchanged.
+
+    The format used for both input and output is a comma separated
+    list of ranges (e.g. "1,2-4,10-10" for ports 1, 2, 3, 4 and
+    10). Writing to the file will clear all previously reserved
+    ports and update the current list with the one given in the
+    input.
+
+    Note that ip_local_port_range and ip_local_reserved_ports
+    settings are independent and both are considered by the kernel
+    when determining which ports are available for automatic port
+    assignments.
+
+    You can reserve ports which are not in the current
+    ip_local_port_range, e.g.::
+
+        $ cat /proc/sys/net/ipv4/ip_local_port_range
+        32000 60999
+        $ cat /proc/sys/net/ipv4/ip_local_reserved_ports
+        8080,9148
+
+    although this is redundant. However, such a setting is useful
+    if later the port range is changed to a value that will
+    include the reserved ports.
+
+    Default: Empty
+
+ip_unprivileged_port_start - INTEGER
+    This is a per-namespace sysctl. It defines the first
+    unprivileged port in the network namespace. Privileged ports
+    require root or CAP_NET_BIND_SERVICE in order to bind to them.
+    To disable all privileged ports, set this to 0. They must not
+    overlap with the ip_local_port_range.
+
+    Default: 1024
+
+ip_nonlocal_bind - BOOLEAN
+    If set, allows processes to bind() to non-local IP addresses,
+    which can be quite useful - but may break some applications.
+
+    Default: 0
+
+ip_autobind_reuse - BOOLEAN
+    By default, bind() does not select the ports automatically even if
+    the new socket and all sockets bound to the port have SO_REUSEADDR.
+    ip_autobind_reuse allows bind() to reuse the port and this is useful
+    when you use bind()+connect(), but may break some applications.
+    The preferred solution is to use IP_BIND_ADDRESS_NO_PORT and this
+    option should only be set by experts.
+
+    Default: 0
+
+ip_dynaddr - BOOLEAN
+    If set non-zero, enables support for dynamic addresses.
+    If set to a non-zero value larger than 1, a kernel log
+    message will be printed when dynamic address rewriting
+    occurs.
+
+    Default: 0
+
+ip_early_demux - BOOLEAN
+    Optimize input packet processing down to one demux for
+    certain kinds of local sockets. Currently we only do this
+    for established TCP and connected UDP sockets.
+
+    It may add an additional cost for pure routing workloads that
+    reduces overall throughput, in which case you should disable it.
+
+    Default: 1
+
+ping_group_range - 2 INTEGERS
+    Restrict ICMP_PROTO datagram sockets to users in the group range.
+    The default is "1 0", meaning that nobody (not even root) may
+    create ping sockets. Setting it to "100 100" would grant permissions
+    to the single group. "0 4294967295" would enable it for the world,
+    "100 4294967295" would enable it for the users, but not daemons.
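+
+These unprivileged "ping" sockets are ordinary datagram sockets using
+IPPROTO_ICMP. A minimal illustrative sketch::
+
+    #include <netinet/in.h>
+    #include <sys/socket.h>
+
+    /* Succeeds only if the caller's group ID falls within
+     * net.ipv4.ping_group_range; otherwise fails with EACCES.
+     */
+    int open_ping_socket(void)
+    {
+            return socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP);
+    }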
+
+tcp_early_demux - BOOLEAN
+    Enable early demux for established TCP sockets.
+
+    Default: 1
+
+udp_early_demux - BOOLEAN
+    Enable early demux for connected UDP sockets. Disable this if
+    your workload consists mostly of unconnected traffic.
+
+    Default: 1
+
+icmp_echo_ignore_all - BOOLEAN
+    If set non-zero, then the kernel will ignore all ICMP ECHO
+    requests sent to it.
+
+    Default: 0
+
+icmp_echo_ignore_broadcasts - BOOLEAN
+    If set non-zero, then the kernel will ignore all ICMP ECHO and
+    TIMESTAMP requests sent to it via broadcast/multicast.
+
+    Default: 1
+
+icmp_ratelimit - INTEGER
+    Limit the maximal rates for sending ICMP packets whose type matches
+    icmp_ratemask (see below) to specific targets.
+    0 to disable any limiting,
+    otherwise the minimal space between responses in milliseconds.
+    Note that another sysctl, icmp_msgs_per_sec, limits the number
+    of ICMP packets sent on all targets.
+
+    Default: 1000
+
+icmp_msgs_per_sec - INTEGER
+    Limit the maximal number of ICMP packets sent per second from this
+    host. Only messages whose type matches icmp_ratemask (see below) are
+    controlled by this limit.
+
+    Default: 1000
+
+icmp_msgs_burst - INTEGER
+    icmp_msgs_per_sec controls the number of ICMP packets sent per second,
+    while icmp_msgs_burst controls the burst size of these packets.
+
+    Default: 50
+
+icmp_ratemask - INTEGER
+    Mask made of ICMP types for which rates are being limited.
+
+    Significant bits: IHGFEDCBA9876543210
+
+    Default mask: 0000001100000011000 (6168)
+
+    Bit definitions (see include/linux/icmp.h):
+
+    = =========================
+    0 Echo Reply
+    3 Destination Unreachable [1]_
+    4 Source Quench [1]_
+    5 Redirect
+    8 Echo Request
+    B Time Exceeded [1]_
+    C Parameter Problem [1]_
+    D Timestamp Request
+    E Timestamp Reply
+    F Info Request
+    G Info Reply
+    H Address Mask Request
+    I Address Mask Reply
+    = =========================
+
+    .. [1] These are rate limited by default (see default mask above)
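+
+As a worked example (illustrative only), the default mask follows from
+the rate-limited types above: 3 (Destination Unreachable), 4 (Source
+Quench), B = 11 (Time Exceeded) and C = 12 (Parameter Problem)::
+
+    #include <stdio.h>
+
+    int main(void)
+    {
+            /* Bits 3, 4, 11 and 12 of the default icmp_ratemask. */
+            unsigned int mask = (1u << 3) | (1u << 4) |
+                                (1u << 11) | (1u << 12);
+
+            printf("%u\n", mask);   /* prints 6168 */
+            return 0;
+    }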
+
+icmp_ignore_bogus_error_responses - BOOLEAN
+    Some routers violate RFC1122 by sending bogus responses to broadcast
+    frames. Such violations are normally logged via a kernel warning.
+    If this is set to TRUE, the kernel will not give such warnings, which
+    will avoid log file clutter.
+
+    Default: 1
+
+icmp_errors_use_inbound_ifaddr - BOOLEAN
+
+    If zero, icmp error messages are sent with the primary address of
+    the exiting interface.
+
+    If non-zero, the message will be sent with the primary address of
+    the interface that received the packet that caused the icmp error.
+    This is the behaviour many network administrators will expect from
+    a router, and it can make debugging complicated network layouts
+    much easier.
+
+    Note that if no primary address exists for the interface selected,
+    then the primary address of the first non-loopback interface that
+    has one will be used regardless of this setting.
+
+    Default: 0
+
+igmp_max_memberships - INTEGER
+    Change the maximum number of multicast groups we can subscribe to.
+    Default: 20
+
+    The theoretical maximum value is bounded by having to send a
+    membership report in a single datagram (i.e. the report can't span
+    multiple datagrams, or risk confusing the switch and leaving groups
+    you don't intend to).
+
+    The number of supported groups 'M' is bounded by the number of group
+    report entries you can fit into a single datagram of 65535 bytes:
+
+    M = (65536 - sizeof(IP header)) / sizeof(Group record)
+
+    Group records are variable length, with a minimum of 12 bytes.
+    So net.ipv4.igmp_max_memberships should not be set higher than:
+
+    (65536-24) / 12 = 5459
+
+    The value 5459 assumes no IP header options, so in practice
+    this number may be lower.
+
+igmp_max_msf - INTEGER
+    Maximum number of addresses allowed in the source filter list for a
+    multicast group.
+
+    Default: 10
+
+igmp_qrv - INTEGER
+    Controls the IGMP query robustness variable (see RFC2236 8.1).
+
+    Default: 2 (as specified by RFC2236 8.1)
+
+    Minimum: 1 (as specified by RFC6636 4.5)
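+
+For reference, the per-socket operation that igmp_max_memberships
+bounds is the IP_ADD_MEMBERSHIP group join. A minimal illustrative
+sketch; the group address is a placeholder::
+
+    #include <arpa/inet.h>
+    #include <netinet/in.h>
+    #include <string.h>
+    #include <sys/socket.h>
+
+    /* Join the (placeholder) group 239.1.2.3 on the default
+     * interface. Each socket may hold at most igmp_max_memberships
+     * such joins; further joins fail.
+     */
+    int join_group(int fd)
+    {
+            struct ip_mreq mreq;
+
+            memset(&mreq, 0, sizeof(mreq));
+            mreq.imr_multiaddr.s_addr = inet_addr("239.1.2.3");
+            mreq.imr_interface.s_addr = htonl(INADDR_ANY);
+
+            return setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP,
+                              &mreq, sizeof(mreq));
+    }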
+
+force_igmp_version - INTEGER
+    - 0 - (default) No enforcement of an IGMP version; IGMPv1/v2 fallback
+      is allowed. The kernel will revert to IGMPv3 mode again if all
+      IGMPv1/v2 Querier Present timers expire.
+    - 1 - Enforce the use of IGMP version 1. Will also reply with an
+      IGMPv1 report if an IGMPv2/v3 query is received.
+    - 2 - Enforce the use of IGMP version 2. Will fall back to IGMPv1 if
+      an IGMPv1 query message is received. Will reply with a report if an
+      IGMPv3 query is received.
+    - 3 - Enforce the use of IGMP version 3. Reacts the same as the
+      default 0.
+
+    .. note::
+
+       This is not the same as force_mld_version, because the Security
+       Considerations section of IGMPv3 (RFC3376) does not state clearly
+       that messages of other versions may be ignored completely, as
+       MLDv2 (RFC3810) does. So keeping this value at its default of 0
+       is recommended.
+
+``conf/interface/*``
+    changes special settings per interface (where "interface" is the
+    name of your network interface)
+
+``conf/all/*``
+    is special, changes the settings for all interfaces
+
+log_martians - BOOLEAN
+    Log packets with impossible addresses to kernel log.
+    log_martians for the interface will be enabled if at least one of
+    conf/{all,interface}/log_martians is set to TRUE,
+    it will be disabled otherwise
+
+accept_redirects - BOOLEAN
+    Accept ICMP redirect messages.
+    accept_redirects for the interface will be enabled if:
+
+    - both conf/{all,interface}/accept_redirects are TRUE in the case
+      forwarding for the interface is enabled
+
+    or
+
+    - at least one of conf/{all,interface}/accept_redirects is TRUE in the
+      case forwarding for the interface is disabled
+
+    accept_redirects for the interface will be disabled otherwise
+
+    default:
+
+    - TRUE (host)
+    - FALSE (router)
+
+forwarding - BOOLEAN
+    Enable IP forwarding on this interface. This controls whether packets
+    received _on_ this interface can be forwarded.
+
+mc_forwarding - BOOLEAN
+    Do multicast routing. The kernel needs to be compiled with CONFIG_MROUTE
+    and a multicast routing daemon is required.
+    conf/all/mc_forwarding must also be set to TRUE to enable multicast
+    routing for the interface
+
+medium_id - INTEGER
+    Integer value used to differentiate the devices by the medium they
+    are attached to. Two devices can have different id values when
+    the broadcast packets are received only on one of them.
+    The default value 0 means that the device is the only interface
+    to its medium, value of -1 means that medium is not known.
+
+    Currently, it is used to change the proxy_arp behavior:
+    the proxy_arp feature is enabled for packets forwarded between
+    two devices attached to different media.
+
+proxy_arp - BOOLEAN
+    Do proxy arp.
+
+    proxy_arp for the interface will be enabled if at least one of
+    conf/{all,interface}/proxy_arp is set to TRUE,
+    it will be disabled otherwise
+
+proxy_arp_pvlan - BOOLEAN
+    Private VLAN proxy arp.
+
+    Basically allow proxy arp replies back to the same interface
+    (from which the ARP request/solicitation was received).
+
+    This is done to support (ethernet) switch features, like RFC
+    3069, where the individual ports are NOT allowed to
+    communicate with each other, but they are allowed to talk to
+    the upstream router. As described in RFC 3069, it is possible
+    to allow these hosts to communicate through the upstream
+    router by proxy_arp'ing. This does not need to be used together
+    with proxy_arp.
+
+    This technology is known by different names:
+
+    In RFC 3069 it is called VLAN Aggregation.
+    Cisco and Allied Telesyn call it Private VLAN.
+    Hewlett-Packard call it Source-Port filtering or port-isolation.
+    Ericsson call it MAC-Forced Forwarding (RFC Draft).
+
+shared_media - BOOLEAN
+    Send(router) or accept(host) RFC1620 shared media redirects.
+    Overrides secure_redirects.
+
+    shared_media for the interface will be enabled if at least one of
+    conf/{all,interface}/shared_media is set to TRUE,
+    it will be disabled otherwise
+
+    default TRUE
+
+secure_redirects - BOOLEAN
+    Accept ICMP redirect messages only to gateways listed in the
+    interface's current gateway list. Even if disabled, RFC1122 redirect
+    rules still apply.
+
+    Overridden by shared_media.
+
+    secure_redirects for the interface will be enabled if at least one of
+    conf/{all,interface}/secure_redirects is set to TRUE,
+    it will be disabled otherwise
+
+    default TRUE
+
+send_redirects - BOOLEAN
+    Send redirects, if router.
+
+    send_redirects for the interface will be enabled if at least one of
+    conf/{all,interface}/send_redirects is set to TRUE,
+    it will be disabled otherwise
+
+    Default: TRUE
+
+bootp_relay - BOOLEAN
+    Accept packets with source address 0.b.c.d destined
+    not to this host as local ones. It is assumed that a
+    BOOTP relay daemon will catch and forward such packets.
+    conf/all/bootp_relay must also be set to TRUE to enable BOOTP relay
+    for the interface
+
+    default FALSE
+
+    Not Implemented Yet.
+
+accept_source_route - BOOLEAN
+    Accept packets with SRR option.
+    conf/all/accept_source_route must also be set to TRUE to accept packets
+    with SRR option on the interface
+
+    default
+
+    - TRUE (router)
+    - FALSE (host)
+
+accept_local - BOOLEAN
+    Accept packets with local source addresses. In combination with
+    suitable routing, this can be used to direct packets between two
+    local interfaces over the wire and have them accepted properly.
+
+    default FALSE
+
+route_localnet - BOOLEAN
+    Do not consider loopback addresses as martian source or destination
+    while routing. This enables the use of 127/8 for local routing purposes.
+
+    default FALSE
+
+rp_filter - INTEGER
+    - 0 - No source validation.
+    - 1 - Strict mode as defined in RFC3704 Strict Reverse Path
+      Each incoming packet is tested against the FIB and if the interface
+      is not the best reverse path the packet check will fail.
+      By default failed packets are discarded.
+    - 2 - Loose mode as defined in RFC3704 Loose Reverse Path
+      Each incoming packet's source address is also tested against the FIB
+      and if the source address is not reachable via any interface
+      the packet check will fail.
+
+    Current recommended practice in RFC3704 is to enable strict mode
+    to prevent IP spoofing from DDoS attacks.
If using asymmetric routing + or other complicated routing, then loose mode is recommended. + + The max value from conf/{all,interface}/rp_filter is used + when doing source validation on the {interface}. + + Default value is 0. Note that some distributions enable it + in startup scripts. + +arp_filter - BOOLEAN + - 1 - Allows you to have multiple network interfaces on the same + subnet, and have the ARPs for each interface be answered + based on whether or not the kernel would route a packet from + the ARP'd IP out that interface (therefore you must use source + based routing for this to work). In other words it allows control + of which cards (usually 1) will respond to an arp request. + + - 0 - (default) The kernel can respond to arp requests with addresses + from other interfaces. This may seem wrong but it usually makes + sense, because it increases the chance of successful communication. + IP addresses are owned by the complete host on Linux, not by + particular interfaces. Only for more complex setups like load- + balancing, does this behaviour cause problems. + + arp_filter for the interface will be enabled if at least one of + conf/{all,interface}/arp_filter is set to TRUE, + it will be disabled otherwise + +arp_announce - INTEGER + Define different restriction levels for announcing the local + source IP address from IP packets in ARP requests sent on + interface: + + - 0 - (default) Use any local address, configured on any interface + - 1 - Try to avoid local addresses that are not in the target's + subnet for this interface. This mode is useful when target + hosts reachable via this interface require the source IP + address in ARP requests to be part of their logical network + configured on the receiving interface. When we generate the + request we will check all our subnets that include the + target IP and will preserve the source address if it is from + such subnet. If there is no such subnet we select source + address according to the rules for level 2. + - 2 - Always use the best local address for this target. + In this mode we ignore the source address in the IP packet + and try to select local address that we prefer for talks with + the target host. Such local address is selected by looking + for primary IP addresses on all our subnets on the outgoing + interface that include the target IP address. If no suitable + local address is found we select the first local address + we have on the outgoing interface or on all other interfaces, + with the hope we will receive reply for our request and + even sometimes no matter the source IP address we announce. + + The max value from conf/{all,interface}/arp_announce is used. + + Increasing the restriction level gives more chance for + receiving answer from the resolved target while decreasing + the level announces more valid sender's information. 
+
+arp_ignore - INTEGER
+    Define different modes for sending replies in response to
+    received ARP requests that resolve local target IP addresses:
+
+    - 0 - (default): reply for any local target IP address, configured
+      on any interface
+    - 1 - reply only if the target IP address is a local address
+      configured on the incoming interface
+    - 2 - reply only if the target IP address is a local address
+      configured on the incoming interface and the sender's IP address
+      is part of the same subnet on this interface
+    - 3 - do not reply for local addresses configured with scope host,
+      only resolutions for global and link addresses are answered
+    - 4-7 - reserved
+    - 8 - do not reply for all local addresses
+
+    The max value from conf/{all,interface}/arp_ignore is used
+    when an ARP request is received on the {interface}
+
+arp_notify - BOOLEAN
+    Define mode for notification of address and device changes.
+
+    == ==========================================================
+    0  (default): do nothing
+    1  Generate gratuitous arp requests when device is brought up
+       or hardware address changes.
+    == ==========================================================
+
+arp_accept - BOOLEAN
+    Define behavior for gratuitous ARP frames whose IP is not
+    already present in the ARP table:
+
+    - 0 - don't create new entries in the ARP table
+    - 1 - create new entries in the ARP table
+
+    Both gratuitous ARP replies and requests will trigger the
+    ARP table to be updated, if this setting is on.
+
+    If the ARP table already contains the IP address of the
+    gratuitous arp frame, the arp table will be updated regardless
+    of whether this setting is on or off.
+
+mcast_solicit - INTEGER
+    The maximum number of multicast probes in INCOMPLETE state,
+    when the associated hardware address is unknown. Defaults
+    to 3.
+
+ucast_solicit - INTEGER
+    The maximum number of unicast probes in PROBE state, when
+    the hardware address is being reconfirmed. Defaults to 3.
+
+app_solicit - INTEGER
+    The maximum number of probes to send to the user space ARP daemon
+    via netlink before dropping back to multicast probes (see
+    mcast_resolicit). Defaults to 0.
+
+mcast_resolicit - INTEGER
+    The maximum number of multicast probes after unicast and
+    app probes in PROBE state. Defaults to 0.
+
+disable_policy - BOOLEAN
+    Disable IPSEC policy (SPD) for this interface
+
+disable_xfrm - BOOLEAN
+    Disable IPSEC encryption on this interface, whatever the policy
+
+igmpv2_unsolicited_report_interval - INTEGER
+    The interval in milliseconds in which the next unsolicited
+    IGMPv1 or IGMPv2 report retransmit will take place.
+
+    Default: 10000 (10 seconds)
+
+igmpv3_unsolicited_report_interval - INTEGER
+    The interval in milliseconds in which the next unsolicited
+    IGMPv3 report retransmit will take place.
+
+    Default: 1000 (1 second)
+
+promote_secondaries - BOOLEAN
+    When a primary IP address is removed from this interface,
+    promote a corresponding secondary IP address instead of
+    removing all the corresponding secondary IP addresses.
+
+drop_unicast_in_l2_multicast - BOOLEAN
+    Drop any unicast IP packets that are received in link-layer
+    multicast (or broadcast) frames.
+
+    This behavior (for multicast) is actually a SHOULD in RFC
+    1122, but is disabled by default for compatibility reasons.
+
+    Default: off (0)
+
+drop_gratuitous_arp - BOOLEAN
+    Drop all gratuitous ARP frames, for example if there's a known
+    good ARP proxy on the network and such frames need not be used
+    (or in the case of 802.11, must not be used to prevent attacks.)
+
+    Default: off (0)
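+
+Several of the ARP entries above take the maximum of the conf/all and
+per-interface values. For illustration, a minimal sketch that sets one
+per-interface knob through procfs; "eth0" is a placeholder interface
+name::
+
+    #include <stdio.h>
+
+    /* Write a value to a per-interface sysctl. For entries such as
+     * arp_ignore the effective setting is the maximum of this and
+     * conf/all/arp_ignore.
+     */
+    int set_arp_ignore(const char *val)
+    {
+            FILE *f;
+
+            f = fopen("/proc/sys/net/ipv4/conf/eth0/arp_ignore", "w");
+            if (!f)
+                    return -1;
+            fprintf(f, "%s\n", val);
+            return fclose(f);
+    }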
+
+
+tag - INTEGER
+    Allows you to write a number, which can be used as required.
+
+    Default value is 0.
+
+xfrm4_gc_thresh - INTEGER
+    (Obsolete since linux-4.14)
+    The threshold at which we will start garbage collecting for IPv4
+    destination cache entries. At twice this value the system will
+    refuse new allocations.
+
+igmp_link_local_mcast_reports - BOOLEAN
+    Enable IGMP reports for link local multicast groups in the
+    224.0.0.X range.
+
+    Default TRUE
+
+Alexey Kuznetsov.
+kuznet@ms2.inr.ac.ru
+
+Updated by:
+
+- Andi Kleen
+  ak@muc.de
+- Nicolas Delon
+  delon.nicolas@wanadoo.fr
+
+
+
+/proc/sys/net/ipv6/* Variables
+==============================
+
+IPv6 has no global variables such as tcp_*. tcp_* settings under ipv4/ also
+apply to IPv6 [XXX?].
+
+bindv6only - BOOLEAN
+    Default value for IPV6_V6ONLY socket option,
+    which restricts use of the IPv6 socket to IPv6 communication
+    only.
+
+    - TRUE: disable IPv4-mapped address feature
+    - FALSE: enable IPv4-mapped address feature
+
+    Default: FALSE (as specified in RFC3493)
+
+flowlabel_consistency - BOOLEAN
+    Protect the consistency (and unicity) of flow label.
+    You have to disable it to use IPV6_FL_F_REFLECT flag on the
+    flow label manager.
+
+    - TRUE: enabled
+    - FALSE: disabled
+
+    Default: TRUE
+
+auto_flowlabels - INTEGER
+    Automatically generate flow labels based on a flow hash of the
+    packet. This allows intermediate devices, such as routers, to
+    identify packet flows for mechanisms like Equal Cost Multipath
+    Routing (see RFC 6438).
+
+    = ===========================================================
+    0 automatic flow labels are completely disabled
+    1 automatic flow labels are enabled by default, they can be
+      disabled on a per socket basis using the IPV6_AUTOFLOWLABEL
+      socket option
+    2 automatic flow labels are allowed, they may be enabled on a
+      per socket basis using the IPV6_AUTOFLOWLABEL socket option
+    3 automatic flow labels are enabled and enforced, they cannot
+      be disabled by the socket option
+    = ===========================================================
+
+    Default: 1
+
+flowlabel_state_ranges - BOOLEAN
+    Split the flow label number space into two ranges. 0-0x7FFFF is
+    reserved for the IPv6 flow manager facility, 0x80000-0xFFFFF
+    is reserved for stateless flow labels as described in RFC6437.
+
+    - TRUE: enabled
+    - FALSE: disabled
+
+    Default: TRUE
+
+flowlabel_reflect - INTEGER
+    Control flow label reflection. Needed for Path MTU
+    Discovery to work with Equal Cost Multipath Routing in anycast
+    environments. See RFC 7690 and:
+    https://tools.ietf.org/html/draft-wang-6man-flow-label-reflection-01
+
+    This is a bitmask.
+
+    - 1: enabled for established flows
+
+      Note that this prevents automatic flowlabel changes, as done
+      in "tcp: change IPv6 flow-label upon receiving spurious retransmission"
+      and "tcp: Change txhash on every SYN and RTO retransmit"
+
+    - 2: enabled for TCP RESET packets (no active listener)
+      If set, a RST packet sent in response to a SYN packet on a closed
+      port will reflect the incoming flow label.
+
+    - 4: enabled for ICMPv6 echo reply messages.
+
+    Default: 0
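+
+The per-socket override mentioned under auto_flowlabels is the
+IPV6_AUTOFLOWLABEL socket option. A minimal illustrative sketch; the
+fallback define mirrors include/uapi/linux/in6.h::
+
+    #include <netinet/in.h>
+    #include <sys/socket.h>
+
+    #ifndef IPV6_AUTOFLOWLABEL
+    #define IPV6_AUTOFLOWLABEL 70
+    #endif
+
+    /* Opt this socket out of automatic flow labels (honoured when
+     * auto_flowlabels is 1 or 2, but not 3).
+     */
+    int disable_auto_flowlabel(int fd)
+    {
+            int off = 0;
+
+            return setsockopt(fd, IPPROTO_IPV6, IPV6_AUTOFLOWLABEL,
+                              &off, sizeof(off));
+    }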
+
+fib_multipath_hash_policy - INTEGER
+    Controls which hash policy to use for multipath routes.
+
+    Default: 0 (Layer 3)
+
+    Possible values:
+
+    - 0 - Layer 3 (source and destination addresses plus flow label)
+    - 1 - Layer 4 (standard 5-tuple)
+    - 2 - Layer 3 or inner Layer 3 if present
+
+anycast_src_echo_reply - BOOLEAN
+    Controls the use of anycast addresses as source addresses for ICMPv6
+    echo reply
+
+    - TRUE: enabled
+    - FALSE: disabled
+
+    Default: FALSE
+
+idgen_delay - INTEGER
+    Controls the delay in seconds after which time to retry
+    privacy stable address generation if a DAD conflict is
+    detected.
+
+    Default: 1 (as specified in RFC7217)
+
+idgen_retries - INTEGER
+    Controls the number of retries to generate a stable privacy
+    address if a DAD conflict is detected.
+
+    Default: 3 (as specified in RFC7217)
+
+mld_qrv - INTEGER
+    Controls the MLD query robustness variable (see RFC3810 9.1).
+
+    Default: 2 (as specified by RFC3810 9.1)
+
+    Minimum: 1 (as specified by RFC6636 4.5)
+
+max_dst_opts_number - INTEGER
+    Maximum number of non-padding TLVs allowed in a Destination
+    options extension header. If this value is less than zero
+    then unknown options are disallowed and the number of known
+    TLVs allowed is the absolute value of this number.
+
+    Default: 8
+
+max_hbh_opts_number - INTEGER
+    Maximum number of non-padding TLVs allowed in a Hop-by-Hop
+    options extension header. If this value is less than zero
+    then unknown options are disallowed and the number of known
+    TLVs allowed is the absolute value of this number.
+
+    Default: 8
+
+max_dst_opts_length - INTEGER
+    Maximum length allowed for a Destination options extension
+    header.
+
+    Default: INT_MAX (unlimited)
+
+max_hbh_length - INTEGER
+    Maximum length allowed for a Hop-by-Hop options extension
+    header.
+
+    Default: INT_MAX (unlimited)
+
+skip_notify_on_dev_down - BOOLEAN
+    Controls whether an RTM_DELROUTE message is generated for routes
+    removed when a device is taken down or deleted. IPv4 does not
+    generate this message; IPv6 does by default. Setting this sysctl
+    to true skips the message, making IPv4 and IPv6 on par in relying
+    on userspace caches to track link events and evict routes.
+
+    Default: false (generate message)
+
+nexthop_compat_mode - BOOLEAN
+    The new nexthop API provides a means for managing nexthops
+    independent of prefixes. Backwards compatibility with the old route
+    format is enabled by default, which means route dumps and
+    notifications contain the new nexthop attribute but also the full,
+    expanded nexthop definition. Further, updates or deletes of a
+    nexthop configuration generate route notifications for each fib
+    entry using the nexthop. Once a system understands the new API,
+    this sysctl can be disabled to achieve the full performance benefits
+    of the new API by disabling the nexthop expansion and extraneous
+    notifications.
+
+    Default: true (backward compat mode)
+
+IPv6 Fragmentation:
+
+ip6frag_high_thresh - INTEGER
+    Maximum memory used to reassemble IPv6 fragments. When
+    ip6frag_high_thresh bytes of memory is allocated for this purpose,
+    the fragment handler will toss packets until ip6frag_low_thresh
+    is reached.
+
+ip6frag_low_thresh - INTEGER
+    See ip6frag_high_thresh
+
+ip6frag_time - INTEGER
+    Time in seconds to keep an IPv6 fragment in memory.
+
+IPv6 Segment Routing:
+
+seg6_flowlabel - INTEGER
+    Controls the behaviour of computing the flowlabel of the outer
+    IPv6 header in case of SR T.encaps
+
+    == =======================================================
+    -1 set flowlabel to zero.
+    0  copy flowlabel from inner packet in case of inner IPv6
+       (set flowlabel to 0 in case of IPv4/L2)
+    1  compute the flowlabel using seg6_make_flowlabel()
+    == =======================================================
+
+    Default is 0.
+
+``conf/default/*``:
+    Change the interface-specific default settings.
+
+
+``conf/all/*``:
+    Change all the interface-specific settings.
+
+    [XXX: Other special features than forwarding?]
+
+conf/all/forwarding - BOOLEAN
+    Enable global IPv6 forwarding between all interfaces.
+
+    IPv4 and IPv6 work differently here; e.g. netfilter must be used
+    to control which interfaces may forward packets and which not.
+
+    This also sets all interfaces' Host/Router setting
+    'forwarding' to the specified value. See below for details.
+
+    This is referred to as global forwarding.
+
+proxy_ndp - BOOLEAN
+    Do proxy ndp.
+
+fwmark_reflect - BOOLEAN
+    Controls the fwmark of kernel-generated IPv6 reply packets that are not
+    associated with a socket (for example, TCP RSTs or ICMPv6 echo replies).
+    If unset, these packets have a fwmark of zero. If set, they have the
+    fwmark of the packet they are replying to.
+
+    Default: 0
+
+``conf/interface/*``:
+    Change special settings per interface.
+
+    The functional behaviour for certain settings is different
+    depending on whether local forwarding is enabled or not.
+
+accept_ra - INTEGER
+    Accept Router Advertisements; autoconfigure using them.
+
+    It also determines whether or not to transmit Router
+    Solicitations. If and only if the functional setting is to
+    accept Router Advertisements, Router Solicitations will be
+    transmitted.
+
+    Possible values are:
+
+    == ===========================================================
+    0  Do not accept Router Advertisements.
+    1  Accept Router Advertisements if forwarding is disabled.
+    2  Overrule forwarding behaviour. Accept Router Advertisements
+       even if forwarding is enabled.
+    == ===========================================================
+
+    Functional default:
+
+    - enabled if local forwarding is disabled.
+    - disabled if local forwarding is enabled.
+
+accept_ra_defrtr - BOOLEAN
+    Learn default router in Router Advertisement.
+
+    Functional default:
+
+    - enabled if accept_ra is enabled.
+    - disabled if accept_ra is disabled.
+
+accept_ra_from_local - BOOLEAN
+    Accept an RA whose source address is found on the local machine,
+    if the RA is otherwise proper and able to be accepted.
+
+    Default is to NOT accept these as it may be an unintended
+    network loop.
+
+    Functional default:
+
+    - enabled if accept_ra_from_local is enabled
+      on a specific interface.
+    - disabled if accept_ra_from_local is disabled
+      on a specific interface.
+
+accept_ra_min_hop_limit - INTEGER
+    Minimum hop limit Information in Router Advertisement.
+
+    Hop limit Information in Router Advertisement less than this
+    variable shall be ignored.
+
+    Default: 1
+
+accept_ra_pinfo - BOOLEAN
+    Learn Prefix Information in Router Advertisement.
+
+    Functional default:
+
+    - enabled if accept_ra is enabled.
+    - disabled if accept_ra is disabled.
+
+accept_ra_rt_info_min_plen - INTEGER
+    Minimum prefix length of Route Information in RA.
+
+    Route Information w/ prefix smaller than this variable shall
+    be ignored.
+
+    Functional default:
+
+    * 0 if accept_ra_rtr_pref is enabled.
+    * -1 if accept_ra_rtr_pref is disabled.
+
+accept_ra_rt_info_max_plen - INTEGER
+    Maximum prefix length of Route Information in RA.
+
+    Route Information w/ prefix larger than this variable shall
+    be ignored.
+ + Functional default: + + * 0 if accept_ra_rtr_pref is enabled. + * -1 if accept_ra_rtr_pref is disabled. + +accept_ra_rtr_pref - BOOLEAN + Accept Router Preference in RA. + + Functional default: + + - enabled if accept_ra is enabled. + - disabled if accept_ra is disabled. + +accept_ra_mtu - BOOLEAN + Apply the MTU value specified in RA option 5 (RFC4861). If + disabled, the MTU specified in the RA will be ignored. + + Functional default: + + - enabled if accept_ra is enabled. + - disabled if accept_ra is disabled. + +accept_redirects - BOOLEAN + Accept Redirects. + + Functional default: + + - enabled if local forwarding is disabled. + - disabled if local forwarding is enabled. + +accept_source_route - INTEGER + Accept source routing (routing extension header). + + - >= 0: Accept only routing header type 2. + - < 0: Do not accept routing header. + + Default: 0 + +autoconf - BOOLEAN + Autoconfigure addresses using Prefix Information in Router + Advertisements. + + Functional default: + + - enabled if accept_ra_pinfo is enabled. + - disabled if accept_ra_pinfo is disabled. + +dad_transmits - INTEGER + The amount of Duplicate Address Detection probes to send. + + Default: 1 + +forwarding - INTEGER + Configure interface-specific Host/Router behaviour. + + .. note:: + + It is recommended to have the same setting on all + interfaces; mixed router/host scenarios are rather uncommon. + + Possible values are: + + - 0 Forwarding disabled + - 1 Forwarding enabled + + **FALSE (0)**: + + By default, Host behaviour is assumed. This means: + + 1. IsRouter flag is not set in Neighbour Advertisements. + 2. If accept_ra is TRUE (default), transmit Router + Solicitations. + 3. If accept_ra is TRUE (default), accept Router + Advertisements (and do autoconfiguration). + 4. If accept_redirects is TRUE (default), accept Redirects. + + **TRUE (1)**: + + If local forwarding is enabled, Router behaviour is assumed. + This means exactly the reverse from the above: + + 1. IsRouter flag is set in Neighbour Advertisements. + 2. Router Solicitations are not sent unless accept_ra is 2. + 3. Router Advertisements are ignored unless accept_ra is 2. + 4. Redirects are ignored. + + Default: 0 (disabled) if global forwarding is disabled (default), + otherwise 1 (enabled). + +hop_limit - INTEGER + Default Hop Limit to set. + + Default: 64 + +mtu - INTEGER + Default Maximum Transfer Unit + + Default: 1280 (IPv6 required minimum) + +ip_nonlocal_bind - BOOLEAN + If set, allows processes to bind() to non-local IPv6 addresses, + which can be quite useful - but may break some applications. + + Default: 0 + +router_probe_interval - INTEGER + Minimum interval (in seconds) between Router Probing described + in RFC4191. + + Default: 60 + +router_solicitation_delay - INTEGER + Number of seconds to wait after interface is brought up + before sending Router Solicitations. + + Default: 1 + +router_solicitation_interval - INTEGER + Number of seconds to wait between Router Solicitations. + + Default: 4 + +router_solicitations - INTEGER + Number of Router Solicitations to send until assuming no + routers are present. + + Default: 3 + +use_oif_addrs_only - BOOLEAN + When enabled, the candidate source addresses for destinations + routed via this interface are restricted to the set of addresses + configured on this interface (vis. RFC 6724, section 4). + + Default: false + +use_tempaddr - INTEGER + Preference for Privacy Extensions (RFC3041). 
+
+    * <= 0 : disable Privacy Extensions
+    * == 1 : enable Privacy Extensions, but prefer public
+      addresses over temporary addresses.
+    * > 1 : enable Privacy Extensions and prefer temporary
+      addresses over public addresses.
+
+    Default:
+
+    * 0 (for most devices)
+    * -1 (for point-to-point devices and loopback devices)
+
+temp_valid_lft - INTEGER
+    Valid lifetime (in seconds) for temporary addresses.
+
+    Default: 604800 (7 days)
+
+temp_prefered_lft - INTEGER
+    Preferred lifetime (in seconds) for temporary addresses.
+
+    Default: 86400 (1 day)
+
+keep_addr_on_down - INTEGER
+    Keep all IPv6 addresses on an interface down event. If set,
+    static global addresses with no expiration time are not flushed.
+
+    * >0 : enabled
+    * 0 : system default
+    * <0 : disabled
+
+    Default: 0 (addresses are removed)
+
+max_desync_factor - INTEGER
+    Maximum value for DESYNC_FACTOR, which is a random value
+    that ensures that clients don't synchronize with each
+    other and generate new addresses at exactly the same time.
+    The value is in seconds.
+
+    Default: 600
+
+regen_max_retry - INTEGER
+    Number of attempts before giving up on generating
+    valid temporary addresses.
+
+    Default: 5
+
+max_addresses - INTEGER
+    Maximum number of autoconfigured addresses per interface. Setting
+    this to zero disables the limitation. It is not recommended to set
+    this value too large (or to zero) because it would be an easy way to
+    crash the kernel by allowing too many addresses to be created.
+
+    Default: 16
+
+disable_ipv6 - BOOLEAN
+    Disable IPv6 operation. If accept_dad is set to 2, this value
+    will be dynamically set to TRUE if DAD fails for the link-local
+    address.
+
+    Default: FALSE (enable IPv6 operation)
+
+    When this value is changed from 1 to 0 (IPv6 is being enabled),
+    it will dynamically create a link-local address on the given
+    interface and start Duplicate Address Detection, if necessary.
+
+    When this value is changed from 0 to 1 (IPv6 is being disabled),
+    it will dynamically delete all addresses and routes on the given
+    interface. From then on it will not be possible to add addresses
+    or routes to the selected interface.
+
+accept_dad - INTEGER
+    Whether to accept DAD (Duplicate Address Detection).
+
+    == ==============================================================
+    0  Disable DAD
+    1  Enable DAD (default)
+    2  Enable DAD, and disable IPv6 operation if MAC-based duplicate
+       link-local address has been found.
+    == ==============================================================
+
+    DAD operation and mode on a given interface will be selected according
+    to the maximum value of conf/{all,interface}/accept_dad.
+
+force_tllao - BOOLEAN
+    Enable sending the target link-layer address option even when
+    responding to a unicast neighbor solicitation.
+
+    Default: FALSE
+
+    Quoting from RFC 2461, section 4.4, Target link-layer address:
+
+    "The option MUST be included for multicast solicitations in order to
+    avoid infinite Neighbor Solicitation "recursion" when the peer node
+    does not have a cache entry to return a Neighbor Advertisements
+    message. When responding to unicast solicitations, the option can be
+    omitted since the sender of the solicitation has the correct link-
+    layer address; otherwise it would not have be able to send the unicast
+    solicitation in the first place. However, including the link-layer
+    address in this case adds little overhead and eliminates a potential
+    race condition where the sender deletes the cached link-layer address
+    prior to receiving a response to a previous solicitation."
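+
+Per RFC 5014, a socket can express the source-address preference that
+use_tempaddr above controls globally. A minimal illustrative sketch,
+assuming a libc that exposes the IPV6_ADDR_PREFERENCES option (glibc
+does)::
+
+    #include <netinet/in.h>
+    #include <sys/socket.h>
+
+    /* Prefer temporary (privacy) source addresses for this socket,
+     * independent of the use_tempaddr sysctl (RFC 5014).
+     */
+    int prefer_temporary_src(int fd)
+    {
+            int pref = IPV6_PREFER_SRC_TMP;
+
+            return setsockopt(fd, IPPROTO_IPV6, IPV6_ADDR_PREFERENCES,
+                              &pref, sizeof(pref));
+    }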
+
+ndisc_notify - BOOLEAN
+    Define mode for notification of address and device changes.
+
+    * 0 - (default): do nothing
+    * 1 - Generate unsolicited neighbour advertisements when device is
+      brought up or hardware address changes.
+
+ndisc_tclass - INTEGER
+    The IPv6 Traffic Class to use by default when sending IPv6 Neighbor
+    Discovery (Router Solicitation, Router Advertisement, Neighbor
+    Solicitation, Neighbor Advertisement, Redirect) messages.
+    These 8 bits can be interpreted as 6 high order bits holding the DSCP
+    value and 2 low order bits representing ECN (which you probably want
+    to leave cleared).
+
+    * 0 - (default)
+
+mldv1_unsolicited_report_interval - INTEGER
+    The interval in milliseconds in which the next unsolicited
+    MLDv1 report retransmit will take place.
+
+    Default: 10000 (10 seconds)
+
+mldv2_unsolicited_report_interval - INTEGER
+    The interval in milliseconds in which the next unsolicited
+    MLDv2 report retransmit will take place.
+
+    Default: 1000 (1 second)
+
+force_mld_version - INTEGER
+    * 0 - (default) No enforcement of an MLD version, MLDv1 fallback allowed
+    * 1 - Enforce to use MLD version 1
+    * 2 - Enforce to use MLD version 2
+
+suppress_frag_ndisc - INTEGER
+    Control RFC 6980 (Security Implications of IPv6 Fragmentation
+    with IPv6 Neighbor Discovery) behavior:
+
+    * 1 - (default) discard fragmented neighbor discovery packets
+    * 0 - allow fragmented neighbor discovery packets
+
+optimistic_dad - BOOLEAN
+    Whether to perform Optimistic Duplicate Address Detection (RFC 4429).
+
+    * 0: disabled (default)
+    * 1: enabled
+
+    Optimistic Duplicate Address Detection for the interface will be enabled
+    if at least one of conf/{all,interface}/optimistic_dad is set to 1,
+    it will be disabled otherwise.
+
+use_optimistic - BOOLEAN
+    If enabled, do not classify optimistic addresses as deprecated during
+    source address selection. Preferred addresses will still be chosen
+    before optimistic addresses, subject to other ranking in the source
+    address selection algorithm.
+
+    * 0: disabled (default)
+    * 1: enabled
+
+    This will be enabled if at least one of
+    conf/{all,interface}/use_optimistic is set to 1, disabled otherwise.
+
+stable_secret - IPv6 address
+    This IPv6 address will be used as a secret to generate IPv6
+    addresses for link-local addresses and autoconfigured
+    ones. All addresses generated after setting this secret will
+    be stable privacy ones by default. This can be changed via the
+    addrgenmode option of ip link. conf/default/stable_secret is used
+    as the secret for the namespace, the interface specific ones can
+    overwrite that. Writes to conf/all/stable_secret are refused.
+
+    It is recommended to generate this secret during installation
+    of a system and keep it stable after that.
+
+    By default the stable secret is unset.
+
+addr_gen_mode - INTEGER
+    Defines how link-local and autoconf addresses are generated.
+
+    = =================================================================
+    0 generate address based on EUI64 (default)
+    1 do not generate a link-local address, use EUI64 for addresses
+      generated from autoconf
+    2 generate stable privacy addresses, using the secret from
+      stable_secret (RFC7217)
+    3 generate stable privacy addresses, using a random secret if unset
+    = =================================================================
+
+drop_unicast_in_l2_multicast - BOOLEAN
+    Drop any unicast IPv6 packets that are received in link-layer
+    multicast (or broadcast) frames.
+
+    By default this is turned off.
+ +drop_unsolicited_na - BOOLEAN + Drop all unsolicited neighbor advertisements, for example if there's + a known good NA proxy on the network and such frames need not be used + (or in the case of 802.11, must not be used to prevent attacks.) + + By default this is turned off. + +enhanced_dad - BOOLEAN + Include a nonce option in the IPv6 neighbor solicitation messages used for + duplicate address detection per RFC7527. A received DAD NS will only signal + a duplicate address if the nonce is different. This avoids any false + detection of duplicates due to loopback of the NS messages that we send. + The nonce option will be sent on an interface unless both of + conf/{all,interface}/enhanced_dad are set to FALSE. + + Default: TRUE + +``icmp/*``: +=========== + +ratelimit - INTEGER + Limit the maximal rates for sending ICMPv6 messages. + + 0 to disable any limiting, + otherwise the minimal space between responses in milliseconds. + + Default: 1000 + +ratemask - list of comma separated ranges + For ICMPv6 message types matching the ranges in the ratemask, limit + the sending of the message according to ratelimit parameter. + + The format used for both input and output is a comma separated + list of ranges (e.g. "0-127,129" for ICMPv6 message type 0 to 127 and + 129). Writing to the file will clear all previous ranges of ICMPv6 + message types and update the current list with the input. + + Refer to: https://www.iana.org/assignments/icmpv6-parameters/icmpv6-parameters.xhtml + for numerical values of ICMPv6 message types, e.g. echo request is 128 + and echo reply is 129. + + Default: 0-1,3-127 (rate limit ICMPv6 errors except Packet Too Big) + +echo_ignore_all - BOOLEAN + If set non-zero, then the kernel will ignore all ICMP ECHO + requests sent to it over the IPv6 protocol. + + Default: 0 + +echo_ignore_multicast - BOOLEAN + If set non-zero, then the kernel will ignore all ICMP ECHO + requests sent to it over the IPv6 protocol via multicast. + + Default: 0 + +echo_ignore_anycast - BOOLEAN + If set non-zero, then the kernel will ignore all ICMP ECHO + requests sent to it over the IPv6 protocol destined to anycast address. + + Default: 0 + +xfrm6_gc_thresh - INTEGER + (Obsolete since linux-4.14) + The threshold at which we will start garbage collecting for IPv6 + destination cache entries. At twice this value the system will + refuse new allocations. + + +IPv6 Update by: +Pekka Savola +YOSHIFUJI Hideaki / USAGI Project + + +/proc/sys/net/bridge/* Variables: +================================= + +bridge-nf-call-arptables - BOOLEAN + - 1 : pass bridged ARP traffic to arptables' FORWARD chain. + - 0 : disable this. + + Default: 1 + +bridge-nf-call-iptables - BOOLEAN + - 1 : pass bridged IPv4 traffic to iptables' chains. + - 0 : disable this. + + Default: 1 + +bridge-nf-call-ip6tables - BOOLEAN + - 1 : pass bridged IPv6 traffic to ip6tables' chains. + - 0 : disable this. + + Default: 1 + +bridge-nf-filter-vlan-tagged - BOOLEAN + - 1 : pass bridged vlan-tagged ARP/IP/IPv6 traffic to {arp,ip,ip6}tables. + - 0 : disable this. + + Default: 0 + +bridge-nf-filter-pppoe-tagged - BOOLEAN + - 1 : pass bridged pppoe-tagged IP/IPv6 traffic to {ip,ip6}tables. + - 0 : disable this. + + Default: 0 + +bridge-nf-pass-vlan-input-dev - BOOLEAN + - 1: if bridge-nf-filter-vlan-tagged is enabled, try to find a vlan + interface on the bridge and set the netfilter input device to the + vlan. This allows use of e.g. "iptables -i br0.1" and makes the + REDIRECT target work with vlan-on-top-of-bridge interfaces. 
When no
+    matching vlan interface is found, or this switch is off, the input
+    device is set to the bridge interface.
+
+    - 0: disable bridge netfilter vlan interface lookup.
+
+    Default: 0
+
+``proc/sys/net/sctp/*`` Variables:
+==================================
+
+addip_enable - BOOLEAN
+    Enable or disable the extension of Dynamic Address Reconfiguration
+    (ADD-IP) functionality specified in RFC5061. This extension provides
+    the ability to dynamically add and remove new addresses for the SCTP
+    associations.
+
+    1: Enable extension.
+
+    0: Disable extension.
+
+    Default: 0
+
+pf_enable - INTEGER
+    Enable or disable pf (pf is short for potentially failed) state. A
+    value of pf_retrans > path_max_retrans also disables pf state. That
+    is, either pf_enable or pf_retrans > path_max_retrans can disable
+    pf state. Since pf_retrans and path_max_retrans can be changed by a
+    userspace application, a user may expect pf state to stay disabled
+    because pf_retrans > path_max_retrans, yet find it re-enabled after
+    one of the two values is changed. This sysctl was therefore added
+    to enable and disable pf state dynamically. See:
+    https://datatracker.ietf.org/doc/draft-ietf-tsvwg-sctp-failover for
+    details.
+
+    1: Enable pf.
+
+    0: Disable pf.
+
+    Default: 1
+
+pf_expose - INTEGER
+    Unset or enable/disable pf (pf is short for potentially failed) state
+    exposure. Applications can control the exposure of the PF path state
+    in the SCTP_PEER_ADDR_CHANGE event and the SCTP_GET_PEER_ADDR_INFO
+    sockopt. When it's unset, no SCTP_PEER_ADDR_CHANGE event with
+    SCTP_ADDR_PF state will be sent, but a SCTP_PF-state transport info
+    can still be obtained via the SCTP_GET_PEER_ADDR_INFO sockopt. When
+    it's enabled, a SCTP_PEER_ADDR_CHANGE event will be sent for a
+    transport becoming SCTP_PF state, and a SCTP_PF-state transport info
+    can be obtained via the SCTP_GET_PEER_ADDR_INFO sockopt. When it's
+    disabled, no SCTP_PEER_ADDR_CHANGE event will be sent, and it
+    returns -EACCES when trying to obtain a SCTP_PF-state transport
+    info via the SCTP_GET_PEER_ADDR_INFO sockopt.
+
+    0: Unset pf state exposure, compatible with old applications.
+
+    1: Disable pf state exposure.
+
+    2: Enable pf state exposure.
+
+    Default: 0
+
+addip_noauth_enable - BOOLEAN
+    Dynamic Address Reconfiguration (ADD-IP) requires the use of
+    authentication to protect the operations of adding or removing new
+    addresses. This requirement is mandated so that unauthorized hosts
+    would not be able to hijack associations. However, older
+    implementations may not have implemented this requirement while
+    allowing the ADD-IP extension. For reasons of interoperability,
+    we provide this variable to control the enforcement of the
+    authentication requirement.
+
+    == ===============================================================
+    1  Allow ADD-IP extension to be used without authentication. This
+       should only be set in a closed environment for interoperability
+       with older implementations.
+
+    0  Enforce the authentication requirement
+    == ===============================================================
+
+    Default: 0
+
+auth_enable - BOOLEAN
+    Enable or disable the Authenticated Chunks extension. This extension
+    provides the ability to send and receive authenticated chunks and is
+    required for secure operation of the Dynamic Address Reconfiguration
+    (ADD-IP) extension.
+
+    - 1: Enable this extension.
+    - 0: Disable this extension.
+
+    Default: 0
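+
+For reference, a minimal sketch of querying a transport's state as
+mentioned under pf_expose, assuming the lksctp-tools headers are
+available; the peer address handling is a placeholder::
+
+    #include <netinet/sctp.h>
+    #include <string.h>
+    #include <sys/socket.h>
+
+    /* Return the state (e.g. SCTP_ACTIVE, SCTP_PF, SCTP_INACTIVE) of
+     * the transport identified by *peer. When pf_expose is disabled,
+     * querying a PF transport fails with errno EACCES.
+     */
+    int get_transport_state(int fd,
+                            const struct sockaddr_storage *peer,
+                            socklen_t peer_len)
+    {
+            struct sctp_paddrinfo info;
+            socklen_t len = sizeof(info);
+
+            memset(&info, 0, sizeof(info));
+            memcpy(&info.spinfo_address, peer, peer_len);
+
+            if (getsockopt(fd, IPPROTO_SCTP, SCTP_GET_PEER_ADDR_INFO,
+                           &info, &len) < 0)
+                    return -1;
+            return info.spinfo_state;
+    }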
+
+prsctp_enable - BOOLEAN
+    Enable or disable the Partial Reliability extension (RFC3758), which
+    is used to notify peers that a given DATA should no longer be expected.
+
+    - 1: Enable extension
+    - 0: Disable
+
+    Default: 1
+
+max_burst - INTEGER
+    The limit of the number of new packets that can be initially sent. It
+    controls how bursty the generated traffic can be.
+
+    Default: 4
+
+association_max_retrans - INTEGER
+    Set the maximum number of retransmissions that an association can
+    attempt before deciding that the remote end is unreachable. If this
+    value is exceeded, the association is terminated.
+
+    Default: 10
+
+max_init_retransmits - INTEGER
+    The maximum number of retransmissions of INIT and COOKIE-ECHO chunks
+    that an association will attempt before declaring the destination
+    unreachable and terminating.
+
+    Default: 8
+
+path_max_retrans - INTEGER
+    The maximum number of retransmissions that will be attempted on a given
+    path. Once this threshold is exceeded, the path is considered
+    unreachable, and new traffic will use a different path when the
+    association is multihomed.
+
+    Default: 5
+
+pf_retrans - INTEGER
+    The number of retransmissions that will be attempted on a given path
+    before traffic is redirected to an alternate transport (should one
+    exist). Note this is distinct from path_max_retrans, as a path that
+    passes the pf_retrans threshold can still be used. It is only
+    deprioritized when a transmission path is selected by the stack. This
+    setting is primarily used to enable fast failover mechanisms without
+    having to reduce path_max_retrans to a very low value. See:
+    http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+    for details. Note also that a value of pf_retrans > path_max_retrans
+    disables this feature. Since both pf_retrans and path_max_retrans can
+    be changed by a userspace application, the pf_enable variable is used
+    to disable pf state.
+
+    Default: 0
+
+ps_retrans - INTEGER
+    Primary.Switchover.Max.Retrans (PSMR), a tunable parameter from
+    section 5, "Primary Path Switchover", of RFC 7829. The primary path
+    will be changed to another active path when the path error counter on
+    the old primary path exceeds PSMR, so that "the SCTP sender is allowed
+    to continue data transmission on a new working path even when the old
+    primary destination address becomes active again". Note this feature
+    is disabled by default: 'ps_retrans' is initialized to 0xffff per
+    netns, and its value can't be set lower than 'pf_retrans' via sysctl.
+
+    Default: 0xffff
+
+rto_initial - INTEGER
+    The initial round trip timeout value in milliseconds that will be used
+    in calculating round trip times. This is the initial time interval
+    for retransmissions.
+
+    Default: 3000
+
+rto_max - INTEGER
+    The maximum value (in milliseconds) of the round trip timeout. This
+    is the largest time interval that can elapse between retransmissions.
+
+    Default: 60000
+
+rto_min - INTEGER
+    The minimum value (in milliseconds) of the round trip timeout. This
+    is the smallest time interval that can elapse between retransmissions.
+
+    Default: 1000
+
+hb_interval - INTEGER
+    The interval (in milliseconds) between HEARTBEAT chunks. These chunks
+    are sent at the specified interval on idle paths to probe the state of
+    a given path between two associations.
+
+    Default: 30000
+
+sack_timeout - INTEGER
+    The amount of time (in milliseconds) that the implementation will wait
+    to send a SACK.
+
+    Default: 200
+
+valid_cookie_life - INTEGER
+    The default lifetime of the SCTP cookie (in milliseconds). The cookie
+    is used during association establishment.
+
+    Default: 60000
+
+cookie_preserve_enable - BOOLEAN
+    Enable or disable the ability to extend the lifetime of the SCTP cookie
+    that is used during the establishment phase of an SCTP association.
+
+    - 1: Enable cookie lifetime extension.
+    - 0: Disable
+
+    Default: 1
+
+cookie_hmac_alg - STRING
+    Select the hmac algorithm used when generating the cookie value sent by
+    a listening sctp socket to a connecting client in the INIT-ACK chunk.
+    Valid values are:
+
+    * md5
+    * sha1
+    * none
+
+    The ability to assign md5 or sha1 as the selected algorithm is
+    predicated on the configuration of those algorithms at build time
+    (CONFIG_CRYPTO_MD5 and CONFIG_CRYPTO_SHA1).
+
+    Default: Dependent on configuration. MD5 if available, else SHA1 if
+    available, else none.
+
+rcvbuf_policy - INTEGER
+    Determines if the receive buffer is attributed to the socket or to
+    the association. SCTP supports the capability to create multiple
+    associations on a single socket. When using this capability, it is
+    possible that a single stalled association that's buffering a lot
+    of data may block other associations from delivering their data by
+    consuming all of the receive buffer space. To work around this,
+    the rcvbuf_policy could be set to attribute the receive buffer space
+    to each association instead of the socket. This prevents the described
+    blocking.
+
+    - 1: rcvbuf space is per association
+    - 0: rcvbuf space is per socket
+
+    Default: 0
+
+sndbuf_policy - INTEGER
+    Similar to rcvbuf_policy above, this applies to send buffer space.
+
+    - 1: Send buffer is tracked per association
+    - 0: Send buffer is tracked per socket.
+
+    Default: 0
+
+sctp_mem - vector of 3 INTEGERs: min, pressure, max
+    Number of pages allowed for queueing by all SCTP sockets.
+
+    min: Below this number of pages SCTP is not concerned about its
+    memory use. When the amount of memory allocated by SCTP exceeds
+    this number, SCTP starts to moderate memory usage.
+
+    pressure: This value was introduced to follow the format of tcp_mem.
+
+    max: Number of pages allowed for queueing by all SCTP sockets.
+
+    Default is calculated at boot time from the amount of available memory.
+
+sctp_rmem - vector of 3 INTEGERs: min, default, max
+    Only the first value ("min") is used, "default" and "max" are
+    ignored.
+
+    min: Minimal size of the receive buffer used by SCTP sockets.
+    It is guaranteed to each SCTP socket (but not association) even
+    under moderate memory pressure.
+
+    Default: 4K
+
+sctp_wmem - vector of 3 INTEGERs: min, default, max
+    Currently this tunable has no effect.
+
+addr_scope_policy - INTEGER
+    Control IPv4 address scoping - draft-stewart-tsvwg-sctp-ipv4-00
+
+    - 0 - Disable IPv4 address scoping
+    - 1 - Enable IPv4 address scoping
+    - 2 - Follow draft but allow IPv4 private addresses
+    - 3 - Follow draft but allow IPv4 link local addresses
+
+    Default: 1
+
+
+``/proc/sys/net/core/*``
+========================
+
+    Please see: Documentation/admin-guide/sysctl/net.rst for descriptions
+    of these entries.
+ + +``/proc/sys/net/unix/*`` +======================== + +max_dgram_qlen - INTEGER + The maximum length of dgram socket receive queue + + Default: 10 + diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt deleted file mode 100644 index 5cdc37c34830..000000000000 --- a/Documentation/networking/ip-sysctl.txt +++ /dev/null @@ -1,2374 +0,0 @@ -/proc/sys/net/ipv4/* Variables: - -ip_forward - BOOLEAN - 0 - disabled (default) - not 0 - enabled - - Forward Packets between interfaces. - - This variable is special, its change resets all configuration - parameters to their default state (RFC1122 for hosts, RFC1812 - for routers) - -ip_default_ttl - INTEGER - Default value of TTL field (Time To Live) for outgoing (but not - forwarded) IP packets. Should be between 1 and 255 inclusive. - Default: 64 (as recommended by RFC1700) - -ip_no_pmtu_disc - INTEGER - Disable Path MTU Discovery. If enabled in mode 1 and a - fragmentation-required ICMP is received, the PMTU to this - destination will be set to min_pmtu (see below). You will need - to raise min_pmtu to the smallest interface MTU on your system - manually if you want to avoid locally generated fragments. - - In mode 2 incoming Path MTU Discovery messages will be - discarded. Outgoing frames are handled the same as in mode 1, - implicitly setting IP_PMTUDISC_DONT on every created socket. - - Mode 3 is a hardened pmtu discover mode. The kernel will only - accept fragmentation-needed errors if the underlying protocol - can verify them besides a plain socket lookup. Current - protocols for which pmtu events will be honored are TCP, SCTP - and DCCP as they verify e.g. the sequence number or the - association. This mode should not be enabled globally but is - only intended to secure e.g. name servers in namespaces where - TCP path mtu must still work but path MTU information of other - protocols should be discarded. If enabled globally this mode - could break other protocols. - - Possible values: 0-3 - Default: FALSE - -min_pmtu - INTEGER - default 552 - minimum discovered Path MTU - -ip_forward_use_pmtu - BOOLEAN - By default we don't trust protocol path MTUs while forwarding - because they could be easily forged and can lead to unwanted - fragmentation by the router. - You only need to enable this if you have user-space software - which tries to discover path mtus by itself and depends on the - kernel honoring this information. This is normally not the - case. - Default: 0 (disabled) - Possible values: - 0 - disabled - 1 - enabled - -fwmark_reflect - BOOLEAN - Controls the fwmark of kernel-generated IPv4 reply packets that are not - associated with a socket for example, TCP RSTs or ICMP echo replies). - If unset, these packets have a fwmark of zero. If set, they have the - fwmark of the packet they are replying to. - Default: 0 - -fib_multipath_use_neigh - BOOLEAN - Use status of existing neighbor entry when determining nexthop for - multipath routes. If disabled, neighbor information is not used and - packets could be directed to a failed nexthop. Only valid for kernels - built with CONFIG_IP_ROUTE_MULTIPATH enabled. - Default: 0 (disabled) - Possible values: - 0 - disabled - 1 - enabled - -fib_multipath_hash_policy - INTEGER - Controls which hash policy to use for multipath routes. Only valid - for kernels built with CONFIG_IP_ROUTE_MULTIPATH enabled. 
- Default: 0 (Layer 3) - Possible values: - 0 - Layer 3 - 1 - Layer 4 - 2 - Layer 3 or inner Layer 3 if present - -fib_sync_mem - UNSIGNED INTEGER - Amount of dirty memory from fib entries that can be backlogged before - synchronize_rcu is forced. - Default: 512kB Minimum: 64kB Maximum: 64MB - -ip_forward_update_priority - INTEGER - Whether to update SKB priority from "TOS" field in IPv4 header after it - is forwarded. The new SKB priority is mapped from TOS field value - according to an rt_tos2priority table (see e.g. man tc-prio). - Default: 1 (Update priority.) - Possible values: - 0 - Do not update priority. - 1 - Update priority. - -route/max_size - INTEGER - Maximum number of routes allowed in the kernel. Increase - this when using large numbers of interfaces and/or routes. - From linux kernel 3.6 onwards, this is deprecated for ipv4 - as route cache is no longer used. - -neigh/default/gc_thresh1 - INTEGER - Minimum number of entries to keep. Garbage collector will not - purge entries if there are fewer than this number. - Default: 128 - -neigh/default/gc_thresh2 - INTEGER - Threshold when garbage collector becomes more aggressive about - purging entries. Entries older than 5 seconds will be cleared - when over this number. - Default: 512 - -neigh/default/gc_thresh3 - INTEGER - Maximum number of non-PERMANENT neighbor entries allowed. Increase - this when using large numbers of interfaces and when communicating - with large numbers of directly-connected peers. - Default: 1024 - -neigh/default/unres_qlen_bytes - INTEGER - The maximum number of bytes which may be used by packets - queued for each unresolved address by other network layers. - (added in linux 3.3) - Setting negative value is meaningless and will return error. - Default: SK_WMEM_MAX, (same as net.core.wmem_default). - Exact value depends on architecture and kernel options, - but should be enough to allow queuing 256 packets - of medium size. - -neigh/default/unres_qlen - INTEGER - The maximum number of packets which may be queued for each - unresolved address by other network layers. - (deprecated in linux 3.3) : use unres_qlen_bytes instead. - Prior to linux 3.3, the default value is 3 which may cause - unexpected packet loss. The current default value is calculated - according to default value of unres_qlen_bytes and true size of - packet. - Default: 101 - -mtu_expires - INTEGER - Time, in seconds, that cached PMTU information is kept. - -min_adv_mss - INTEGER - The advertised MSS depends on the first hop route MTU, but will - never be lower than this setting. - -IP Fragmentation: - -ipfrag_high_thresh - LONG INTEGER - Maximum memory used to reassemble IP fragments. - -ipfrag_low_thresh - LONG INTEGER - (Obsolete since linux-4.17) - Maximum memory used to reassemble IP fragments before the kernel - begins to remove incomplete fragment queues to free up resources. - The kernel still accepts new fragments for defragmentation. - -ipfrag_time - INTEGER - Time in seconds to keep an IP fragment in memory. - -ipfrag_max_dist - INTEGER - ipfrag_max_dist is a non-negative integer value which defines the - maximum "disorder" which is allowed among fragments which share a - common IP source address. Note that reordering of packets is - not unusual, but if a large number of fragments arrive from a source - IP address while a particular fragment queue remains incomplete, it - probably indicates that one or more fragments belonging to that queue - have been lost. 
When ipfrag_max_dist is positive, an additional check - is done on fragments before they are added to a reassembly queue - if - ipfrag_max_dist (or more) fragments have arrived from a particular IP - address between additions to any IP fragment queue using that source - address, it's presumed that one or more fragments in the queue are - lost. The existing fragment queue will be dropped, and a new one - started. An ipfrag_max_dist value of zero disables this check. - - Using a very small value, e.g. 1 or 2, for ipfrag_max_dist can - result in unnecessarily dropping fragment queues when normal - reordering of packets occurs, which could lead to poor application - performance. Using a very large value, e.g. 50000, increases the - likelihood of incorrectly reassembling IP fragments that originate - from different IP datagrams, which could result in data corruption. - Default: 64 - -INET peer storage: - -inet_peer_threshold - INTEGER - The approximate size of the storage. Starting from this threshold - entries will be thrown aggressively. This threshold also determines - entries' time-to-live and time intervals between garbage collection - passes. More entries, less time-to-live, less GC interval. - -inet_peer_minttl - INTEGER - Minimum time-to-live of entries. Should be enough to cover fragment - time-to-live on the reassembling side. This minimum time-to-live is - guaranteed if the pool size is less than inet_peer_threshold. - Measured in seconds. - -inet_peer_maxttl - INTEGER - Maximum time-to-live of entries. Unused entries will expire after - this period of time if there is no memory pressure on the pool (i.e. - when the number of entries in the pool is very small). - Measured in seconds. - -TCP variables: - -somaxconn - INTEGER - Limit of socket listen() backlog, known in userspace as SOMAXCONN. - Defaults to 4096. (Was 128 before linux-5.4) - See also tcp_max_syn_backlog for additional tuning for TCP sockets. - -tcp_abort_on_overflow - BOOLEAN - If listening service is too slow to accept new connections, - reset them. Default state is FALSE. It means that if overflow - occurred due to a burst, connection will recover. Enable this - option _only_ if you are really sure that listening daemon - cannot be tuned to accept connections faster. Enabling this - option can harm clients of your server. - -tcp_adv_win_scale - INTEGER - Count buffering overhead as bytes/2^tcp_adv_win_scale - (if tcp_adv_win_scale > 0) or bytes-bytes/2^(-tcp_adv_win_scale), - if it is <= 0. - Possible values are [-31, 31], inclusive. - Default: 1 - -tcp_allowed_congestion_control - STRING - Show/set the congestion control choices available to non-privileged - processes. The list is a subset of those listed in - tcp_available_congestion_control. - Default is "reno" and the default setting (tcp_congestion_control). - -tcp_app_win - INTEGER - Reserve max(window/2^tcp_app_win, mss) of window for application - buffer. Value 0 is special, it means that nothing is reserved. - Default: 31 - -tcp_autocorking - BOOLEAN - Enable TCP auto corking : - When applications do consecutive small write()/sendmsg() system calls, - we try to coalesce these small writes as much as possible, to lower - total amount of sent packets. This is done if at least one prior - packet for the flow is waiting in Qdisc queues or device transmit - queue. Applications can still use TCP_CORK for optimal behavior - when they know how/when to uncork their sockets. 
- Default : 1 - -tcp_available_congestion_control - STRING - Shows the available congestion control choices that are registered. - More congestion control algorithms may be available as modules, - but not loaded. - -tcp_base_mss - INTEGER - The initial value of search_low to be used by the packetization layer - Path MTU discovery (MTU probing). If MTU probing is enabled, - this is the initial MSS used by the connection. - -tcp_mtu_probe_floor - INTEGER - If MTU probing is enabled this caps the minimum MSS used for search_low - for the connection. - - Default : 48 - -tcp_min_snd_mss - INTEGER - TCP SYN and SYNACK messages usually advertise an ADVMSS option, - as described in RFC 1122 and RFC 6691. - If this ADVMSS option is smaller than tcp_min_snd_mss, - it is silently capped to tcp_min_snd_mss. - - Default : 48 (at least 8 bytes of payload per segment) - -tcp_congestion_control - STRING - Set the congestion control algorithm to be used for new - connections. The algorithm "reno" is always available, but - additional choices may be available based on kernel configuration. - Default is set as part of kernel configuration. - For passive connections, the listener congestion control choice - is inherited. - [see setsockopt(listenfd, SOL_TCP, TCP_CONGESTION, "name" ...) ] - -tcp_dsack - BOOLEAN - Allows TCP to send "duplicate" SACKs. - -tcp_early_retrans - INTEGER - Tail loss probe (TLP) converts RTOs occurring due to tail - losses into fast recovery (draft-ietf-tcpm-rack). Note that - TLP requires RACK to function properly (see tcp_recovery below) - Possible values: - 0 disables TLP - 3 or 4 enables TLP - Default: 3 - -tcp_ecn - INTEGER - Control use of Explicit Congestion Notification (ECN) by TCP. - ECN is used only when both ends of the TCP connection indicate - support for it. This feature is useful in avoiding losses due - to congestion by allowing supporting routers to signal - congestion before having to drop packets. - Possible values are: - 0 Disable ECN. Neither initiate nor accept ECN. - 1 Enable ECN when requested by incoming connections and - also request ECN on outgoing connection attempts. - 2 Enable ECN when requested by incoming connections - but do not request ECN on outgoing connections. - Default: 2 - -tcp_ecn_fallback - BOOLEAN - If the kernel detects that ECN connection misbehaves, enable fall - back to non-ECN. Currently, this knob implements the fallback - from RFC3168, section 6.1.1.1., but we reserve that in future, - additional detection mechanisms could be implemented under this - knob. The value is not used, if tcp_ecn or per route (or congestion - control) ECN settings are disabled. - Default: 1 (fallback enabled) - -tcp_fack - BOOLEAN - This is a legacy option, it has no effect anymore. - -tcp_fin_timeout - INTEGER - The length of time an orphaned (no longer referenced by any - application) connection will remain in the FIN_WAIT_2 state - before it is aborted at the local end. While a perfectly - valid "receive only" state for an un-orphaned connection, an - orphaned connection in FIN_WAIT_2 state could otherwise wait - forever for the remote to close its end of the connection. - Cf. tcp_max_orphans - Default: 60 seconds - -tcp_frto - INTEGER - Enables Forward RTO-Recovery (F-RTO) defined in RFC5682. - F-RTO is an enhanced recovery algorithm for TCP retransmission - timeouts. It is particularly beneficial in networks where the - RTT fluctuates (e.g., wireless). F-RTO is sender-side only - modification. It does not require any support from the peer. 
- - By default it's enabled with a non-zero value. 0 disables F-RTO. - -tcp_fwmark_accept - BOOLEAN - If set, incoming connections to listening sockets that do not have a - socket mark will set the mark of the accepting socket to the fwmark of - the incoming SYN packet. This will cause all packets on that connection - (starting from the first SYNACK) to be sent with that fwmark. The - listening socket's mark is unchanged. Listening sockets that already - have a fwmark set via setsockopt(SOL_SOCKET, SO_MARK, ...) are - unaffected. - - Default: 0 - -tcp_invalid_ratelimit - INTEGER - Limit the maximal rate for sending duplicate acknowledgments - in response to incoming TCP packets that are for an existing - connection but that are invalid due to any of these reasons: - - (a) out-of-window sequence number, - (b) out-of-window acknowledgment number, or - (c) PAWS (Protection Against Wrapped Sequence numbers) check failure - - This can help mitigate simple "ack loop" DoS attacks, wherein - a buggy or malicious middlebox or man-in-the-middle can - rewrite TCP header fields in manner that causes each endpoint - to think that the other is sending invalid TCP segments, thus - causing each side to send an unterminating stream of duplicate - acknowledgments for invalid segments. - - Using 0 disables rate-limiting of dupacks in response to - invalid segments; otherwise this value specifies the minimal - space between sending such dupacks, in milliseconds. - - Default: 500 (milliseconds). - -tcp_keepalive_time - INTEGER - How often TCP sends out keepalive messages when keepalive is enabled. - Default: 2hours. - -tcp_keepalive_probes - INTEGER - How many keepalive probes TCP sends out, until it decides that the - connection is broken. Default value: 9. - -tcp_keepalive_intvl - INTEGER - How frequently the probes are send out. Multiplied by - tcp_keepalive_probes it is time to kill not responding connection, - after probes started. Default value: 75sec i.e. connection - will be aborted after ~11 minutes of retries. - -tcp_l3mdev_accept - BOOLEAN - Enables child sockets to inherit the L3 master device index. - Enabling this option allows a "global" listen socket to work - across L3 master domains (e.g., VRFs) with connected sockets - derived from the listen socket to be bound to the L3 domain in - which the packets originated. Only valid when the kernel was - compiled with CONFIG_NET_L3_MASTER_DEV. - Default: 0 (disabled) - -tcp_low_latency - BOOLEAN - This is a legacy option, it has no effect anymore. - -tcp_max_orphans - INTEGER - Maximal number of TCP sockets not attached to any user file handle, - held by system. If this number is exceeded orphaned connections are - reset immediately and warning is printed. This limit exists - only to prevent simple DoS attacks, you _must_ not rely on this - or lower the limit artificially, but rather increase it - (probably, after increasing installed memory), - if network conditions require more than default value, - and tune network services to linger and kill such states - more aggressively. Let me to remind again: each orphan eats - up to ~64K of unswappable memory. - -tcp_max_syn_backlog - INTEGER - Maximal number of remembered connection requests (SYN_RECV), - which have not received an acknowledgment from connecting client. - This is a per-listener limit. - The minimal value is 128 for low memory machines, and it will - increase in proportion to the memory of machine. - If server suffers from overload, try increasing this number. 
- Remember to also check /proc/sys/net/core/somaxconn - A SYN_RECV request socket consumes about 304 bytes of memory. - -tcp_max_tw_buckets - INTEGER - Maximal number of timewait sockets held by system simultaneously. - If this number is exceeded time-wait socket is immediately destroyed - and warning is printed. This limit exists only to prevent - simple DoS attacks, you _must_ not lower the limit artificially, - but rather increase it (probably, after increasing installed memory), - if network conditions require more than default value. - -tcp_mem - vector of 3 INTEGERs: min, pressure, max - min: below this number of pages TCP is not bothered about its - memory appetite. - - pressure: when amount of memory allocated by TCP exceeds this number - of pages, TCP moderates its memory consumption and enters memory - pressure mode, which is exited when memory consumption falls - under "min". - - max: number of pages allowed for queueing by all TCP sockets. - - Defaults are calculated at boot time from amount of available - memory. - -tcp_min_rtt_wlen - INTEGER - The window length of the windowed min filter to track the minimum RTT. - A shorter window lets a flow more quickly pick up new (higher) - minimum RTT when it is moved to a longer path (e.g., due to traffic - engineering). A longer window makes the filter more resistant to RTT - inflations such as transient congestion. The unit is seconds. - Possible values: 0 - 86400 (1 day) - Default: 300 - -tcp_moderate_rcvbuf - BOOLEAN - If set, TCP performs receive buffer auto-tuning, attempting to - automatically size the buffer (no greater than tcp_rmem[2]) to - match the size required by the path for full throughput. Enabled by - default. - -tcp_mtu_probing - INTEGER - Controls TCP Packetization-Layer Path MTU Discovery. Takes three - values: - 0 - Disabled - 1 - Disabled by default, enabled when an ICMP black hole detected - 2 - Always enabled, use initial MSS of tcp_base_mss. - -tcp_probe_interval - UNSIGNED INTEGER - Controls how often to start TCP Packetization-Layer Path MTU - Discovery reprobe. The default is reprobing every 10 minutes as - per RFC4821. - -tcp_probe_threshold - INTEGER - Controls when TCP Packetization-Layer Path MTU Discovery probing - will stop in respect to the width of search range in bytes. Default - is 8 bytes. - -tcp_no_metrics_save - BOOLEAN - By default, TCP saves various connection metrics in the route cache - when the connection closes, so that connections established in the - near future can use these to set initial conditions. Usually, this - increases overall performance, but may sometimes cause performance - degradation. If set, TCP will not cache metrics on closing - connections. - -tcp_no_ssthresh_metrics_save - BOOLEAN - Controls whether TCP saves ssthresh metrics in the route cache. - Default is 1, which disables ssthresh metrics. - -tcp_orphan_retries - INTEGER - This value influences the timeout of a locally closed TCP connection, - when RTO retransmissions remain unacknowledged. - See tcp_retries2 for more details. - - The default value is 8. - If your machine is a loaded WEB server, - you should think about lowering this value, such sockets - may consume significant resources. Cf. tcp_max_orphans. - -tcp_recovery - INTEGER - This value is a bitmap to enable various experimental loss recovery - features. - - RACK: 0x1 enables the RACK loss detection for fast detection of lost - retransmissions and tail drops. It also subsumes and disables - RFC6675 recovery for SACK connections. 
- RACK: 0x2 makes RACK's reordering window static (min_rtt/4). - RACK: 0x4 disables RACK's DUPACK threshold heuristic - - Default: 0x1 - -tcp_reordering - INTEGER - Initial reordering level of packets in a TCP stream. - TCP stack can then dynamically adjust flow reordering level - between this initial value and tcp_max_reordering - Default: 3 - -tcp_max_reordering - INTEGER - Maximal reordering level of packets in a TCP stream. - 300 is a fairly conservative value, but you might increase it - if paths are using per packet load balancing (like bonding rr mode) - Default: 300 - -tcp_retrans_collapse - BOOLEAN - Bug-to-bug compatibility with some broken printers. - On retransmit try to send bigger packets to work around bugs in - certain TCP stacks. - -tcp_retries1 - INTEGER - This value influences the time, after which TCP decides, that - something is wrong due to unacknowledged RTO retransmissions, - and reports this suspicion to the network layer. - See tcp_retries2 for more details. - - RFC 1122 recommends at least 3 retransmissions, which is the - default. - -tcp_retries2 - INTEGER - This value influences the timeout of an alive TCP connection, - when RTO retransmissions remain unacknowledged. - Given a value of N, a hypothetical TCP connection following - exponential backoff with an initial RTO of TCP_RTO_MIN would - retransmit N times before killing the connection at the (N+1)th RTO. - - The default value of 15 yields a hypothetical timeout of 924.6 - seconds and is a lower bound for the effective timeout. - TCP will effectively time out at the first RTO which exceeds the - hypothetical timeout. - - RFC 1122 recommends at least 100 seconds for the timeout, - which corresponds to a value of at least 8. - -tcp_rfc1337 - BOOLEAN - If set, the TCP stack behaves conforming to RFC1337. If unset, - we are not conforming to RFC, but prevent TCP TIME_WAIT - assassination. - Default: 0 - -tcp_rmem - vector of 3 INTEGERs: min, default, max - min: Minimal size of receive buffer used by TCP sockets. - It is guaranteed to each TCP socket, even under moderate memory - pressure. - Default: 4K - - default: initial size of receive buffer used by TCP sockets. - This value overrides net.core.rmem_default used by other protocols. - Default: 87380 bytes. This value results in window of 65535 with - default setting of tcp_adv_win_scale and tcp_app_win:0 and a bit - less for default tcp_app_win. See below about these variables. - - max: maximal size of receive buffer allowed for automatically - selected receiver buffers for TCP socket. This value does not override - net.core.rmem_max. Calling setsockopt() with SO_RCVBUF disables - automatic tuning of that socket's receive buffer size, in which - case this value is ignored. - Default: between 87380B and 6MB, depending on RAM size. - -tcp_sack - BOOLEAN - Enable select acknowledgments (SACKS). - -tcp_comp_sack_delay_ns - LONG INTEGER - TCP tries to reduce number of SACK sent, using a timer - based on 5% of SRTT, capped by this sysctl, in nano seconds. - The default is 1ms, based on TSO autosizing period. - - Default : 1,000,000 ns (1 ms) - -tcp_comp_sack_nr - INTEGER - Max number of SACK that can be compressed. - Using 0 disables SACK compression. - - Default : 44 - -tcp_slow_start_after_idle - BOOLEAN - If set, provide RFC2861 behavior and time out the congestion - window after an idle period. An idle period is defined at - the current RTO. If unset, the congestion window will not - be timed out after an idle period. 
- Default: 1 - -tcp_stdurg - BOOLEAN - Use the Host requirements interpretation of the TCP urgent pointer field. - Most hosts use the older BSD interpretation, so if you turn this on - Linux might not communicate correctly with them. - Default: FALSE - -tcp_synack_retries - INTEGER - Number of times SYNACKs for a passive TCP connection attempt will - be retransmitted. Should not be higher than 255. Default value - is 5, which corresponds to 31seconds till the last retransmission - with the current initial RTO of 1second. With this the final timeout - for a passive TCP connection will happen after 63seconds. - -tcp_syncookies - INTEGER - Only valid when the kernel was compiled with CONFIG_SYN_COOKIES - Send out syncookies when the syn backlog queue of a socket - overflows. This is to prevent against the common 'SYN flood attack' - Default: 1 - - Note, that syncookies is fallback facility. - It MUST NOT be used to help highly loaded servers to stand - against legal connection rate. If you see SYN flood warnings - in your logs, but investigation shows that they occur - because of overload with legal connections, you should tune - another parameters until this warning disappear. - See: tcp_max_syn_backlog, tcp_synack_retries, tcp_abort_on_overflow. - - syncookies seriously violate TCP protocol, do not allow - to use TCP extensions, can result in serious degradation - of some services (f.e. SMTP relaying), visible not by you, - but your clients and relays, contacting you. While you see - SYN flood warnings in logs not being really flooded, your server - is seriously misconfigured. - - If you want to test which effects syncookies have to your - network connections you can set this knob to 2 to enable - unconditionally generation of syncookies. - -tcp_fastopen - INTEGER - Enable TCP Fast Open (RFC7413) to send and accept data in the opening - SYN packet. - - The client support is enabled by flag 0x1 (on by default). The client - then must use sendmsg() or sendto() with the MSG_FASTOPEN flag, - rather than connect() to send data in SYN. - - The server support is enabled by flag 0x2 (off by default). Then - either enable for all listeners with another flag (0x400) or - enable individual listeners via TCP_FASTOPEN socket option with - the option value being the length of the syn-data backlog. - - The values (bitmap) are - 0x1: (client) enables sending data in the opening SYN on the client. - 0x2: (server) enables the server support, i.e., allowing data in - a SYN packet to be accepted and passed to the - application before 3-way handshake finishes. - 0x4: (client) send data in the opening SYN regardless of cookie - availability and without a cookie option. - 0x200: (server) accept data-in-SYN w/o any cookie option present. - 0x400: (server) enable all listeners to support Fast Open by - default without explicit TCP_FASTOPEN socket option. - - Default: 0x1 - - Note that that additional client or server features are only - effective if the basic support (0x1 and 0x2) are enabled respectively. - -tcp_fastopen_blackhole_timeout_sec - INTEGER - Initial time period in second to disable Fastopen on active TCP sockets - when a TFO firewall blackhole issue happens. - This time period will grow exponentially when more blackhole issues - get detected right after Fastopen is re-enabled and will reset to - initial value when the blackhole issue goes away. - 0 to disable the blackhole detection. - By default, it is set to 1hr. 
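To make the tcp_fastopen bitmap above concrete, a minimal sketch of enabling
both the client (0x1) and server (0x2) support at runtime (paths as above;
note that a listener must still set the TCP_FASTOPEN socket option unless
0x400 is also set):

	$ sysctl -w net.ipv4.tcp_fastopen=3
	$ cat /proc/sys/net/ipv4/tcp_fastopen
	3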
-
-tcp_fastopen_key - list of comma separated 32-digit hexadecimal INTEGERs
-	The list consists of a primary key and an optional backup key. The
-	primary key is used for both creating and validating cookies, while the
-	optional backup key is only used for validating cookies. The purpose of
-	the backup key is to maximize TFO validation when keys are rotated.
-
-	A randomly chosen primary key may be configured by the kernel if
-	the tcp_fastopen sysctl is set to 0x400 (see above), or if the
-	TCP_FASTOPEN setsockopt() optname is set and a key has not been
-	previously configured via sysctl. If keys are configured via
-	setsockopt() by using the TCP_FASTOPEN_KEY optname, then those
-	per-socket keys will be used instead of any keys that are specified via
-	sysctl.
-
-	A key is specified as 4 8-digit hexadecimal integers which are separated
-	by a '-' as: xxxxxxxx-xxxxxxxx-xxxxxxxx-xxxxxxxx. Leading zeros may be
-	omitted. A primary and a backup key may be specified by separating them
-	by a comma. If only one key is specified, it becomes the primary key and
-	any previously configured backup keys are removed.
-
-tcp_syn_retries - INTEGER
-	Number of times initial SYNs for an active TCP connection attempt
-	will be retransmitted. Should not be higher than 127. Default value
-	is 6, which corresponds to 63 seconds until the last retransmission
-	with the current initial RTO of 1 second. With this the final timeout
-	for an active TCP connection attempt will happen after 127 seconds.
-
-tcp_timestamps - INTEGER
-	Enable timestamps as defined in RFC1323.
-	0: Disabled.
-	1: Enable timestamps as defined in RFC1323 and use random offset for
-	each connection rather than only using the current time.
-	2: Like 1, but without random offsets.
-	Default: 1
-
-tcp_min_tso_segs - INTEGER
-	Minimal number of segments per TSO frame.
-	Since linux-3.12, TCP does an automatic sizing of TSO frames,
-	depending on flow rate, instead of filling 64KB packets.
-	For specific usages, it's possible to force TCP to build big
-	TSO frames. Note that the TCP stack might split overly large TSO
-	packets if the available window is too small.
-	Default: 2
-
-tcp_pacing_ss_ratio - INTEGER
-	sk->sk_pacing_rate is set by the TCP stack using a ratio applied
-	to the current rate. (current_rate = cwnd * mss / srtt)
-	If TCP is in slow start, tcp_pacing_ss_ratio is applied
-	to let TCP probe for bigger speeds, assuming cwnd can be
-	doubled every other RTT.
-	Default: 200
-
-tcp_pacing_ca_ratio - INTEGER
-	sk->sk_pacing_rate is set by the TCP stack using a ratio applied
-	to the current rate. (current_rate = cwnd * mss / srtt)
-	If TCP is in the congestion avoidance phase, tcp_pacing_ca_ratio
-	is applied to conservatively probe for bigger throughput.
-	Default: 120
-
-tcp_tso_win_divisor - INTEGER
-	This allows control over what percentage of the congestion window
-	can be consumed by a single TSO frame.
-	The setting of this parameter is a choice between burstiness and
-	building larger TSO frames.
-	Default: 3
-
-tcp_tw_reuse - INTEGER
-	Enable reuse of TIME-WAIT sockets for new connections when it is
-	safe from a protocol viewpoint.
-	0 - disable
-	1 - global enable
-	2 - enable for loopback traffic only
-	It should not be changed without advice/request of technical
-	experts.
-	Default: 2
-
-tcp_window_scaling - BOOLEAN
-	Enable window scaling as defined in RFC1323.
-
-tcp_wmem - vector of 3 INTEGERs: min, default, max
-	min: Amount of memory reserved for send buffers for TCP sockets.
-	Each TCP socket is entitled to use it from the moment it is created.
- Default: 4K - - default: initial size of send buffer used by TCP sockets. This - value overrides net.core.wmem_default used by other protocols. - It is usually lower than net.core.wmem_default. - Default: 16K - - max: Maximal amount of memory allowed for automatically tuned - send buffers for TCP sockets. This value does not override - net.core.wmem_max. Calling setsockopt() with SO_SNDBUF disables - automatic tuning of that socket's send buffer size, in which case - this value is ignored. - Default: between 64K and 4MB, depending on RAM size. - -tcp_notsent_lowat - UNSIGNED INTEGER - A TCP socket can control the amount of unsent bytes in its write queue, - thanks to TCP_NOTSENT_LOWAT socket option. poll()/select()/epoll() - reports POLLOUT events if the amount of unsent bytes is below a per - socket value, and if the write queue is not full. sendmsg() will - also not add new buffers if the limit is hit. - - This global variable controls the amount of unsent data for - sockets not using TCP_NOTSENT_LOWAT. For these sockets, a change - to the global variable has immediate effect. - - Default: UINT_MAX (0xFFFFFFFF) - -tcp_workaround_signed_windows - BOOLEAN - If set, assume no receipt of a window scaling option means the - remote TCP is broken and treats the window as a signed quantity. - If unset, assume the remote TCP is not broken even if we do - not receive a window scaling option from them. - Default: 0 - -tcp_thin_linear_timeouts - BOOLEAN - Enable dynamic triggering of linear timeouts for thin streams. - If set, a check is performed upon retransmission by timeout to - determine if the stream is thin (less than 4 packets in flight). - As long as the stream is found to be thin, up to 6 linear - timeouts may be performed before exponential backoff mode is - initiated. This improves retransmission latency for - non-aggressive thin streams, often found to be time-dependent. - For more information on thin streams, see - Documentation/networking/tcp-thin.txt - Default: 0 - -tcp_limit_output_bytes - INTEGER - Controls TCP Small Queue limit per tcp socket. - TCP bulk sender tends to increase packets in flight until it - gets losses notifications. With SNDBUF autotuning, this can - result in a large amount of packets queued on the local machine - (e.g.: qdiscs, CPU backlog, or device) hurting latency of other - flows, for typical pfifo_fast qdiscs. tcp_limit_output_bytes - limits the number of bytes on qdisc or device to reduce artificial - RTT/cwnd and reduce bufferbloat. - Default: 1048576 (16 * 65536) - -tcp_challenge_ack_limit - INTEGER - Limits number of Challenge ACK sent per second, as recommended - in RFC 5961 (Improving TCP's Robustness to Blind In-Window Attacks) - Default: 1000 - -tcp_rx_skb_cache - BOOLEAN - Controls a per TCP socket cache of one skb, that might help - performance of some workloads. This might be dangerous - on systems with a lot of TCP sockets, since it increases - memory usage. - - Default: 0 (disabled) - -UDP variables: - -udp_l3mdev_accept - BOOLEAN - Enabling this option allows a "global" bound socket to work - across L3 master domains (e.g., VRFs) with packets capable of - being received regardless of the L3 domain in which they - originated. Only valid when the kernel was compiled with - CONFIG_NET_L3_MASTER_DEV. - Default: 0 (disabled) - -udp_mem - vector of 3 INTEGERs: min, pressure, max - Number of pages allowed for queueing by all UDP sockets. - - min: Below this number of pages UDP is not bothered about its - memory appetite. 
When the amount of memory allocated by UDP exceeds
-	this number, UDP starts to moderate memory usage.
-
-	pressure: This value was introduced to follow the format of tcp_mem.
-
-	max: Number of pages allowed for queueing by all UDP sockets.
-
-	Default is calculated at boot time from the amount of available memory.
-
-udp_rmem_min - INTEGER
-	Minimal size of the receive buffer used by UDP sockets in moderation.
-	Each UDP socket is able to use this size for receiving data, even if
-	the total pages of UDP sockets exceed udp_mem pressure. The unit is
-	bytes.
-	Default: 4K
-
-udp_wmem_min - INTEGER
-	Minimal size of the send buffer used by UDP sockets in moderation.
-	Each UDP socket is able to use this size for sending data, even if
-	the total pages of UDP sockets exceed udp_mem pressure. The unit is
-	bytes.
-	Default: 4K
-
-RAW variables:
-
-raw_l3mdev_accept - BOOLEAN
-	Enabling this option allows a "global" bound socket to work
-	across L3 master domains (e.g., VRFs) with packets capable of
-	being received regardless of the L3 domain in which they
-	originated. Only valid when the kernel was compiled with
-	CONFIG_NET_L3_MASTER_DEV.
-	Default: 1 (enabled)
-
-CIPSOv4 Variables:
-
-cipso_cache_enable - BOOLEAN
-	If set, enable additions to and lookups from the CIPSO label mapping
-	cache. If unset, additions are ignored and lookups always result in a
-	miss. However, regardless of the setting, the cache is still
-	invalidated when required, which means you can safely toggle this on
-	and off; the cache will always be "safe".
-	Default: 1
-
-cipso_cache_bucket_size - INTEGER
-	The CIPSO label cache consists of a fixed size hash table with each
-	hash bucket containing a number of cache entries. This variable limits
-	the number of entries in each hash bucket; the larger the value the
-	more CIPSO label mappings that can be cached. When the number of
-	entries in a given hash bucket reaches this limit adding new entries
-	causes the oldest entry in the bucket to be removed to make room.
-	Default: 10
-
-cipso_rbm_optfmt - BOOLEAN
-	Enable the "Optimized Tag 1 Format" as defined in section 3.4.2.6 of
-	the CIPSO draft specification (see Documentation/netlabel for details).
-	This means that when set, the CIPSO tag will be padded with empty
-	categories in order to make the packet data 32-bit aligned.
-	Default: 0
-
-cipso_rbm_structvalid - BOOLEAN
-	If set, do a very strict check of the CIPSO option when
-	ip_options_compile() is called. If unset, relax the checks done during
-	ip_options_compile(). Either way is "safe", as errors are caught
-	elsewhere in the CIPSO processing code, but setting this to 0 (False)
-	should result in less work (i.e. it should be faster) but could cause
-	problems with other implementations that require strict checking.
-	Default: 0
-
-IP Variables:
-
-ip_local_port_range - 2 INTEGERS
-	Defines the local port range that is used by TCP and UDP to
-	choose the local port. The first number is the first local port
-	number, the second is the last.
-	If possible, it is better if these numbers have different parity
-	(one even and one odd value).
-	Must be greater than or equal to ip_unprivileged_port_start.
-	The default values are 32768 and 60999 respectively.
-
-ip_local_reserved_ports - list of comma separated ranges
-	Specify the ports which are reserved for known third-party
-	applications. These ports will not be used by automatic port
-	assignments (e.g. when calling connect() or bind() with port
-	number 0). Explicit port allocation behavior is unchanged.
- - The format used for both input and output is a comma separated - list of ranges (e.g. "1,2-4,10-10" for ports 1, 2, 3, 4 and - 10). Writing to the file will clear all previously reserved - ports and update the current list with the one given in the - input. - - Note that ip_local_port_range and ip_local_reserved_ports - settings are independent and both are considered by the kernel - when determining which ports are available for automatic port - assignments. - - You can reserve ports which are not in the current - ip_local_port_range, e.g.: - - $ cat /proc/sys/net/ipv4/ip_local_port_range - 32000 60999 - $ cat /proc/sys/net/ipv4/ip_local_reserved_ports - 8080,9148 - - although this is redundant. However such a setting is useful - if later the port range is changed to a value that will - include the reserved ports. - - Default: Empty - -ip_unprivileged_port_start - INTEGER - This is a per-namespace sysctl. It defines the first - unprivileged port in the network namespace. Privileged ports - require root or CAP_NET_BIND_SERVICE in order to bind to them. - To disable all privileged ports, set this to 0. They must not - overlap with the ip_local_port_range. - - Default: 1024 - -ip_nonlocal_bind - BOOLEAN - If set, allows processes to bind() to non-local IP addresses, - which can be quite useful - but may break some applications. - Default: 0 - -ip_autobind_reuse - BOOLEAN - By default, bind() does not select the ports automatically even if - the new socket and all sockets bound to the port have SO_REUSEADDR. - ip_autobind_reuse allows bind() to reuse the port and this is useful - when you use bind()+connect(), but may break some applications. - The preferred solution is to use IP_BIND_ADDRESS_NO_PORT and this - option should only be set by experts. - Default: 0 - -ip_dynaddr - BOOLEAN - If set non-zero, enables support for dynamic addresses. - If set to a non-zero value larger than 1, a kernel log - message will be printed when dynamic address rewriting - occurs. - Default: 0 - -ip_early_demux - BOOLEAN - Optimize input packet processing down to one demux for - certain kinds of local sockets. Currently we only do this - for established TCP and connected UDP sockets. - - It may add an additional cost for pure routing workloads that - reduces overall throughput, in such case you should disable it. - Default: 1 - -ping_group_range - 2 INTEGERS - Restrict ICMP_PROTO datagram sockets to users in the group range. - The default is "1 0", meaning, that nobody (not even root) may - create ping sockets. Setting it to "100 100" would grant permissions - to the single group. "0 4294967295" would enable it for the world, "100 - 4294967295" would enable it for the users, but not daemons. - -tcp_early_demux - BOOLEAN - Enable early demux for established TCP sockets. - Default: 1 - -udp_early_demux - BOOLEAN - Enable early demux for connected UDP sockets. Disable this if - your system could experience more unconnected load. - Default: 1 - -icmp_echo_ignore_all - BOOLEAN - If set non-zero, then the kernel will ignore all ICMP ECHO - requests sent to it. - Default: 0 - -icmp_echo_ignore_broadcasts - BOOLEAN - If set non-zero, then the kernel will ignore all ICMP ECHO and - TIMESTAMP requests sent to it via broadcast/multicast. - Default: 1 - -icmp_ratelimit - INTEGER - Limit the maximal rates for sending ICMP packets whose type matches - icmp_ratemask (see below) to specific targets. - 0 to disable any limiting, - otherwise the minimal space between responses in milliseconds. 
-	Note that another sysctl, icmp_msgs_per_sec, limits the number
-	of ICMP packets sent across all targets.
-	Default: 1000
-
-icmp_msgs_per_sec - INTEGER
-	Limit the maximal number of ICMP packets sent per second from this
-	host. Only messages whose type matches icmp_ratemask (see below) are
-	controlled by this limit.
-	Default: 1000
-
-icmp_msgs_burst - INTEGER
-	icmp_msgs_per_sec controls the number of ICMP packets sent per second,
-	while icmp_msgs_burst controls the burst size of these packets.
-	Default: 50
-
-icmp_ratemask - INTEGER
-	Mask made of ICMP types for which rates are being limited.
-	Significant bits: IHGFEDCBA9876543210
-	Default mask: 0000001100000011000 (6168)
-
-	Bit definitions (see include/linux/icmp.h):
-		0 Echo Reply
-		3 Destination Unreachable *
-		4 Source Quench *
-		5 Redirect
-		8 Echo Request
-		B Time Exceeded *
-		C Parameter Problem *
-		D Timestamp Request
-		E Timestamp Reply
-		F Info Request
-		G Info Reply
-		H Address Mask Request
-		I Address Mask Reply
-
-	* These are rate limited by default (see default mask above)
-
-icmp_ignore_bogus_error_responses - BOOLEAN
-	Some routers violate RFC1122 by sending bogus responses to broadcast
-	frames. Such violations are normally logged via a kernel warning.
-	If this is set to TRUE, the kernel will not give such warnings, which
-	will avoid log file clutter.
-	Default: 1
-
-icmp_errors_use_inbound_ifaddr - BOOLEAN
-
-	If zero, icmp error messages are sent with the primary address of
-	the exiting interface.
-
-	If non-zero, the message will be sent with the primary address of
-	the interface that received the packet that caused the icmp error.
-	This is the behaviour many network administrators will expect from
-	a router, and it can make debugging complicated network layouts
-	much easier.
-
-	Note that if no primary address exists for the interface selected,
-	then the primary address of the first non-loopback interface that
-	has one will be used regardless of this setting.
-
-	Default: 0
-
-igmp_max_memberships - INTEGER
-	Change the maximum number of multicast groups we can subscribe to.
-	Default: 20
-
-	Theoretical maximum value is bounded by having to send a membership
-	report in a single datagram (i.e. the report can't span multiple
-	datagrams, or risk confusing the switch and leaving groups you don't
-	intend to).
-
-	The number of supported groups 'M' is bounded by the number of group
-	report entries you can fit into a single datagram of 65535 bytes:
-
-	M = (65536 - sizeof(ip header)) / sizeof(Group record)
-
-	Group records are variable length, with a minimum of 12 bytes.
-	So net.ipv4.igmp_max_memberships should not be set higher than:
-
-	(65536-24) / 12 = 5459
-
-	The value 5459 assumes no IP header options, so in practice
-	this number may be lower.
-
-igmp_max_msf - INTEGER
-	Maximum number of addresses allowed in the source filter list for a
-	multicast group.
-	Default: 10
-
-igmp_qrv - INTEGER
-	Controls the IGMP query robustness variable (see RFC2236 8.1).
-	Default: 2 (as specified by RFC2236 8.1)
-	Minimum: 1 (as specified by RFC6636 4.5)
-
-force_igmp_version - INTEGER
-	0 - (default) No enforcement of an IGMP version; IGMPv1/v2 fallback
-	    is allowed. The host falls back to IGMPv3 mode again when all
-	    IGMPv1/v2 Querier Present timers expire.
-	1 - Enforce use of IGMP version 1. Also replies with an IGMPv1
-	    report if an IGMPv2/v3 query is received.
-	2 - Enforce use of IGMP version 2. Falls back to IGMPv1 if an
-	    IGMPv1 query message is received. Replies with a report if an
-	    IGMPv3 query is received.
-	3 - Enforce use of IGMP version 3.
-	    This reacts the same as the default value 0.
-
-	Note: this is not the same as force_mld_version, because the Security
-	Considerations of IGMPv3 (RFC3376), unlike those of MLDv2 (RFC3810),
-	do not clearly state that messages of other versions may be ignored
-	completely. Keeping the default value of 0 is therefore recommended.
-
-conf/interface/* changes special settings per interface (where
-"interface" is the name of your network interface)
-
-conf/all/* is special; it changes the settings for all interfaces
-
-log_martians - BOOLEAN
-	Log packets with impossible addresses to kernel log.
-	log_martians for the interface will be enabled if at least one of
-	conf/{all,interface}/log_martians is set to TRUE,
-	it will be disabled otherwise
-
-accept_redirects - BOOLEAN
-	Accept ICMP redirect messages.
-	accept_redirects for the interface will be enabled if:
-	- both conf/{all,interface}/accept_redirects are TRUE in the case
-	  forwarding for the interface is enabled
-	or
-	- at least one of conf/{all,interface}/accept_redirects is TRUE in the
-	  case forwarding for the interface is disabled
-	accept_redirects for the interface will be disabled otherwise
-	default TRUE (host)
-		FALSE (router)
-
-forwarding - BOOLEAN
-	Enable IP forwarding on this interface. This controls whether packets
-	received _on_ this interface can be forwarded.
-
-mc_forwarding - BOOLEAN
-	Do multicast routing. The kernel needs to be compiled with CONFIG_MROUTE
-	and a multicast routing daemon is required.
-	conf/all/mc_forwarding must also be set to TRUE to enable multicast
-	routing for the interface
-
-medium_id - INTEGER
-	Integer value used to differentiate the devices by the medium they
-	are attached to. Two devices can have different id values when
-	the broadcast packets are received only on one of them.
-	The default value 0 means that the device is the only interface
-	to its medium, value of -1 means that medium is not known.
-
-	Currently, it is used to change the proxy_arp behavior:
-	the proxy_arp feature is enabled for packets forwarded between
-	two devices attached to different media.
-
-proxy_arp - BOOLEAN
-	Do proxy arp.
-	proxy_arp for the interface will be enabled if at least one of
-	conf/{all,interface}/proxy_arp is set to TRUE,
-	it will be disabled otherwise
-
-proxy_arp_pvlan - BOOLEAN
-	Private VLAN proxy arp.
-	Basically allow proxy arp replies back to the same interface
-	(from which the ARP request/solicitation was received).
-
-	This is done to support (ethernet) switch features, like RFC
-	3069, where the individual ports are NOT allowed to
-	communicate with each other, but they are allowed to talk to
-	the upstream router. As described in RFC 3069, it is possible
-	to allow these hosts to communicate through the upstream
-	router by proxy_arp'ing. It does not need to be used together
-	with proxy_arp.
-
-	This technology is known by different names:
-	  In RFC 3069 it is called VLAN Aggregation.
-	  Cisco and Allied Telesyn call it Private VLAN.
-	  Hewlett-Packard calls it Source-Port filtering or port-isolation.
-	  Ericsson calls it MAC-Forced Forwarding (RFC Draft).
-
-shared_media - BOOLEAN
-	Send (router) or accept (host) RFC1620 shared media redirects.
-	Overrides secure_redirects.
-	shared_media for the interface will be enabled if at least one of
-	conf/{all,interface}/shared_media is set to TRUE,
-	it will be disabled otherwise
-	default TRUE
-
-secure_redirects - BOOLEAN
-	Accept ICMP redirect messages only to gateways listed in the
-	interface's current gateway list. Even if disabled, RFC1122 redirect
-	rules still apply.
- Overridden by shared_media. - secure_redirects for the interface will be enabled if at least one of - conf/{all,interface}/secure_redirects is set to TRUE, - it will be disabled otherwise - default TRUE - -send_redirects - BOOLEAN - Send redirects, if router. - send_redirects for the interface will be enabled if at least one of - conf/{all,interface}/send_redirects is set to TRUE, - it will be disabled otherwise - Default: TRUE - -bootp_relay - BOOLEAN - Accept packets with source address 0.b.c.d destined - not to this host as local ones. It is supposed, that - BOOTP relay daemon will catch and forward such packets. - conf/all/bootp_relay must also be set to TRUE to enable BOOTP relay - for the interface - default FALSE - Not Implemented Yet. - -accept_source_route - BOOLEAN - Accept packets with SRR option. - conf/all/accept_source_route must also be set to TRUE to accept packets - with SRR option on the interface - default TRUE (router) - FALSE (host) - -accept_local - BOOLEAN - Accept packets with local source addresses. In combination with - suitable routing, this can be used to direct packets between two - local interfaces over the wire and have them accepted properly. - default FALSE - -route_localnet - BOOLEAN - Do not consider loopback addresses as martian source or destination - while routing. This enables the use of 127/8 for local routing purposes. - default FALSE - -rp_filter - INTEGER - 0 - No source validation. - 1 - Strict mode as defined in RFC3704 Strict Reverse Path - Each incoming packet is tested against the FIB and if the interface - is not the best reverse path the packet check will fail. - By default failed packets are discarded. - 2 - Loose mode as defined in RFC3704 Loose Reverse Path - Each incoming packet's source address is also tested against the FIB - and if the source address is not reachable via any interface - the packet check will fail. - - Current recommended practice in RFC3704 is to enable strict mode - to prevent IP spoofing from DDos attacks. If using asymmetric routing - or other complicated routing, then loose mode is recommended. - - The max value from conf/{all,interface}/rp_filter is used - when doing source validation on the {interface}. - - Default value is 0. Note that some distributions enable it - in startup scripts. - -arp_filter - BOOLEAN - 1 - Allows you to have multiple network interfaces on the same - subnet, and have the ARPs for each interface be answered - based on whether or not the kernel would route a packet from - the ARP'd IP out that interface (therefore you must use source - based routing for this to work). In other words it allows control - of which cards (usually 1) will respond to an arp request. - - 0 - (default) The kernel can respond to arp requests with addresses - from other interfaces. This may seem wrong but it usually makes - sense, because it increases the chance of successful communication. - IP addresses are owned by the complete host on Linux, not by - particular interfaces. Only for more complex setups like load- - balancing, does this behaviour cause problems. 
- - arp_filter for the interface will be enabled if at least one of - conf/{all,interface}/arp_filter is set to TRUE, - it will be disabled otherwise - -arp_announce - INTEGER - Define different restriction levels for announcing the local - source IP address from IP packets in ARP requests sent on - interface: - 0 - (default) Use any local address, configured on any interface - 1 - Try to avoid local addresses that are not in the target's - subnet for this interface. This mode is useful when target - hosts reachable via this interface require the source IP - address in ARP requests to be part of their logical network - configured on the receiving interface. When we generate the - request we will check all our subnets that include the - target IP and will preserve the source address if it is from - such subnet. If there is no such subnet we select source - address according to the rules for level 2. - 2 - Always use the best local address for this target. - In this mode we ignore the source address in the IP packet - and try to select local address that we prefer for talks with - the target host. Such local address is selected by looking - for primary IP addresses on all our subnets on the outgoing - interface that include the target IP address. If no suitable - local address is found we select the first local address - we have on the outgoing interface or on all other interfaces, - with the hope we will receive reply for our request and - even sometimes no matter the source IP address we announce. - - The max value from conf/{all,interface}/arp_announce is used. - - Increasing the restriction level gives more chance for - receiving answer from the resolved target while decreasing - the level announces more valid sender's information. - -arp_ignore - INTEGER - Define different modes for sending replies in response to - received ARP requests that resolve local target IP addresses: - 0 - (default): reply for any local target IP address, configured - on any interface - 1 - reply only if the target IP address is local address - configured on the incoming interface - 2 - reply only if the target IP address is local address - configured on the incoming interface and both with the - sender's IP address are part from same subnet on this interface - 3 - do not reply for local addresses configured with scope host, - only resolutions for global and link addresses are replied - 4-7 - reserved - 8 - do not reply for all local addresses - - The max value from conf/{all,interface}/arp_ignore is used - when ARP request is received on the {interface} - -arp_notify - BOOLEAN - Define mode for notification of address and device changes. - 0 - (default): do nothing - 1 - Generate gratuitous arp requests when device is brought up - or hardware address changes. - -arp_accept - BOOLEAN - Define behavior for gratuitous ARP frames who's IP is not - already present in the ARP table: - 0 - don't create new entries in the ARP table - 1 - create new entries in the ARP table - - Both replies and requests type gratuitous arp will trigger the - ARP table to be updated, if this setting is on. - - If the ARP table already contains the IP address of the - gratuitous arp frame, the arp table will be updated regardless - if this setting is on or off. - -mcast_solicit - INTEGER - The maximum number of multicast probes in INCOMPLETE state, - when the associated hardware address is unknown. Defaults - to 3. 
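As a sketch of how the arp_ignore and arp_announce levels above are commonly
combined on a multi-homed host (the interface name eth0 is illustrative):

	$ sysctl -w net.ipv4.conf.all.arp_ignore=1	# answer only on the owning interface
	$ sysctl -w net.ipv4.conf.all.arp_announce=2	# announce the best local source

Since the max of the conf/{all,interface} values is used for both knobs,
setting conf/all is sufficient here.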
- -ucast_solicit - INTEGER - The maximum number of unicast probes in PROBE state, when - the hardware address is being reconfirmed. Defaults to 3. - -app_solicit - INTEGER - The maximum number of probes to send to the user space ARP daemon - via netlink before dropping back to multicast probes (see - mcast_resolicit). Defaults to 0. - -mcast_resolicit - INTEGER - The maximum number of multicast probes after unicast and - app probes in PROBE state. Defaults to 0. - -disable_policy - BOOLEAN - Disable IPSEC policy (SPD) for this interface - -disable_xfrm - BOOLEAN - Disable IPSEC encryption on this interface, whatever the policy - -igmpv2_unsolicited_report_interval - INTEGER - The interval in milliseconds in which the next unsolicited - IGMPv1 or IGMPv2 report retransmit will take place. - Default: 10000 (10 seconds) - -igmpv3_unsolicited_report_interval - INTEGER - The interval in milliseconds in which the next unsolicited - IGMPv3 report retransmit will take place. - Default: 1000 (1 seconds) - -promote_secondaries - BOOLEAN - When a primary IP address is removed from this interface - promote a corresponding secondary IP address instead of - removing all the corresponding secondary IP addresses. - -drop_unicast_in_l2_multicast - BOOLEAN - Drop any unicast IP packets that are received in link-layer - multicast (or broadcast) frames. - This behavior (for multicast) is actually a SHOULD in RFC - 1122, but is disabled by default for compatibility reasons. - Default: off (0) - -drop_gratuitous_arp - BOOLEAN - Drop all gratuitous ARP frames, for example if there's a known - good ARP proxy on the network and such frames need not be used - (or in the case of 802.11, must not be used to prevent attacks.) - Default: off (0) - - -tag - INTEGER - Allows you to write a number, which can be used as required. - Default value is 0. - -xfrm4_gc_thresh - INTEGER - (Obsolete since linux-4.14) - The threshold at which we will start garbage collecting for IPv4 - destination cache entries. At twice this value the system will - refuse new allocations. - -igmp_link_local_mcast_reports - BOOLEAN - Enable IGMP reports for link local multicast groups in the - 224.0.0.X range. - Default TRUE - -Alexey Kuznetsov. -kuznet@ms2.inr.ac.ru - -Updated by: -Andi Kleen -ak@muc.de -Nicolas Delon -delon.nicolas@wanadoo.fr - - - - -/proc/sys/net/ipv6/* Variables: - -IPv6 has no global variables such as tcp_*. tcp_* settings under ipv4/ also -apply to IPv6 [XXX?]. - -bindv6only - BOOLEAN - Default value for IPV6_V6ONLY socket option, - which restricts use of the IPv6 socket to IPv6 communication - only. - TRUE: disable IPv4-mapped address feature - FALSE: enable IPv4-mapped address feature - - Default: FALSE (as specified in RFC3493) - -flowlabel_consistency - BOOLEAN - Protect the consistency (and unicity) of flow label. - You have to disable it to use IPV6_FL_F_REFLECT flag on the - flow label manager. - TRUE: enabled - FALSE: disabled - Default: TRUE - -auto_flowlabels - INTEGER - Automatically generate flow labels based on a flow hash of the - packet. This allows intermediate devices, such as routers, to - identify packet flows for mechanisms like Equal Cost Multipath - Routing (see RFC 6438). 
- 0: automatic flow labels are completely disabled - 1: automatic flow labels are enabled by default, they can be - disabled on a per socket basis using the IPV6_AUTOFLOWLABEL - socket option - 2: automatic flow labels are allowed, they may be enabled on a - per socket basis using the IPV6_AUTOFLOWLABEL socket option - 3: automatic flow labels are enabled and enforced, they cannot - be disabled by the socket option - Default: 1 - -flowlabel_state_ranges - BOOLEAN - Split the flow label number space into two ranges. 0-0x7FFFF is - reserved for the IPv6 flow manager facility, 0x80000-0xFFFFF - is reserved for stateless flow labels as described in RFC6437. - TRUE: enabled - FALSE: disabled - Default: true - -flowlabel_reflect - INTEGER - Control flow label reflection. Needed for Path MTU - Discovery to work with Equal Cost Multipath Routing in anycast - environments. See RFC 7690 and: - https://tools.ietf.org/html/draft-wang-6man-flow-label-reflection-01 - - This is a bitmask. - 1: enabled for established flows - - Note that this prevents automatic flowlabel changes, as done - in "tcp: change IPv6 flow-label upon receiving spurious retransmission" - and "tcp: Change txhash on every SYN and RTO retransmit" - - 2: enabled for TCP RESET packets (no active listener) - If set, a RST packet sent in response to a SYN packet on a closed - port will reflect the incoming flow label. - - 4: enabled for ICMPv6 echo reply messages. - - Default: 0 - -fib_multipath_hash_policy - INTEGER - Controls which hash policy to use for multipath routes. - Default: 0 (Layer 3) - Possible values: - 0 - Layer 3 (source and destination addresses plus flow label) - 1 - Layer 4 (standard 5-tuple) - 2 - Layer 3 or inner Layer 3 if present - -anycast_src_echo_reply - BOOLEAN - Controls the use of anycast addresses as source addresses for ICMPv6 - echo reply - TRUE: enabled - FALSE: disabled - Default: FALSE - -idgen_delay - INTEGER - Controls the delay in seconds after which time to retry - privacy stable address generation if a DAD conflict is - detected. - Default: 1 (as specified in RFC7217) - -idgen_retries - INTEGER - Controls the number of retries to generate a stable privacy - address if a DAD conflict is detected. - Default: 3 (as specified in RFC7217) - -mld_qrv - INTEGER - Controls the MLD query robustness variable (see RFC3810 9.1). - Default: 2 (as specified by RFC3810 9.1) - Minimum: 1 (as specified by RFC6636 4.5) - -max_dst_opts_number - INTEGER - Maximum number of non-padding TLVs allowed in a Destination - options extension header. If this value is less than zero - then unknown options are disallowed and the number of known - TLVs allowed is the absolute value of this number. - Default: 8 - -max_hbh_opts_number - INTEGER - Maximum number of non-padding TLVs allowed in a Hop-by-Hop - options extension header. If this value is less than zero - then unknown options are disallowed and the number of known - TLVs allowed is the absolute value of this number. - Default: 8 - -max_dst_opts_length - INTEGER - Maximum length allowed for a Destination options extension - header. - Default: INT_MAX (unlimited) - -max_hbh_length - INTEGER - Maximum length allowed for a Hop-by-Hop options extension - header. - Default: INT_MAX (unlimited) - -skip_notify_on_dev_down - BOOLEAN - Controls whether an RTM_DELROUTE message is generated for routes - removed when a device is taken down or deleted. IPv4 does not - generate this message; IPv6 does by default. 
Setting this sysctl - to true skips the message, making IPv4 and IPv6 on par in relying - on userspace caches to track link events and evict routes. - Default: false (generate message) - -nexthop_compat_mode - BOOLEAN - The new nexthop API provides a means for managing nexthops independent of - prefixes. Backwards compatibility with the old route format is enabled by - default, which means route dumps and notifications contain the new - nexthop attribute but also the full, expanded nexthop definition. - Further, updates or deletes of a nexthop configuration generate route - notifications for each fib entry using the nexthop. Once a system - understands the new API, this sysctl can be disabled to achieve the full - performance benefits of the new API by disabling the nexthop expansion - and extraneous notifications. - Default: true (backward compat mode) - -IPv6 Fragmentation: - -ip6frag_high_thresh - INTEGER - Maximum memory used to reassemble IPv6 fragments. When - ip6frag_high_thresh bytes of memory is allocated for this purpose, - the fragment handler will toss packets until ip6frag_low_thresh - is reached. - -ip6frag_low_thresh - INTEGER - See ip6frag_high_thresh - -ip6frag_time - INTEGER - Time in seconds to keep an IPv6 fragment in memory. - -IPv6 Segment Routing: - -seg6_flowlabel - INTEGER - Controls the behaviour of computing the flowlabel of the outer - IPv6 header in case of SR T.encaps - - -1 set flowlabel to zero. - 0 copy flowlabel from the inner packet in case of inner IPv6 - (set flowlabel to 0 in case of IPv4/L2) - 1 compute the flowlabel using seg6_make_flowlabel() - - Default is 0. - -conf/default/*: - Change the interface-specific default settings. - - -conf/all/*: - Change all the interface-specific settings. - - [XXX: Other special features than forwarding?] - -conf/all/forwarding - BOOLEAN - Enable global IPv6 forwarding between all interfaces. - - IPv4 and IPv6 work differently here; e.g. netfilter must be used - to control which interfaces may forward packets and which may not. - - This also sets all interfaces' Host/Router setting - 'forwarding' to the specified value. See below for details. - - This is referred to as global forwarding. - -proxy_ndp - BOOLEAN - Do proxy NDP. - -fwmark_reflect - BOOLEAN - Controls the fwmark of kernel-generated IPv6 reply packets that are not - associated with a socket (for example, TCP RSTs or ICMPv6 echo replies). - If unset, these packets have a fwmark of zero. If set, they have the - fwmark of the packet they are replying to. - Default: 0 - -conf/interface/*: - Change special settings per interface. - - The functional behaviour for certain settings is different - depending on whether local forwarding is enabled or not. - -accept_ra - INTEGER - Accept Router Advertisements; autoconfigure using them. - - It also determines whether or not to transmit Router - Solicitations. If and only if the functional setting is to - accept Router Advertisements, Router Solicitations will be - transmitted. - - Possible values are: - 0 Do not accept Router Advertisements. - 1 Accept Router Advertisements if forwarding is disabled. - 2 Overrule forwarding behaviour. Accept Router Advertisements - even if forwarding is enabled. - - Functional default: enabled if local forwarding is disabled. - disabled if local forwarding is enabled. - -accept_ra_defrtr - BOOLEAN - Learn the default router in Router Advertisements. - - Functional default: enabled if accept_ra is enabled. - disabled if accept_ra is disabled.
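These per-interface knobs are ordinary procfs files, so they can be inspected programmatically as well as with sysctl(8). A minimal sketch in C (illustrative only; the interface name "eth0" and the knob name "accept_ra" are example values, and errors are collapsed into a -1 return):

#include <stdio.h>
#include <stdlib.h>

/* Read one IPv6 per-interface setting via procfs. */
static long read_ipv6_conf(const char *ifname, const char *knob)
{
	char path[128];
	char buf[64];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/proc/sys/net/ipv6/conf/%s/%s", ifname, knob);
	f = fopen(path, "r");
	if (!f)
		return -1;	/* no such interface or knob */
	if (!fgets(buf, sizeof(buf), f)) {
		fclose(f);
		return -1;
	}
	fclose(f);
	return strtol(buf, NULL, 10);
}

int main(void)
{
	printf("accept_ra(eth0) = %ld\n",
	       read_ipv6_conf("eth0", "accept_ra"));
	return 0;
}

The same pattern works for conf/all/* and conf/default/*, which are just additional directories under /proc/sys/net/ipv6/conf/.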
- -accept_ra_from_local - BOOLEAN - Accept RA with source-address that is found on local machine - if the RA is otherwise proper and able to be accepted. - Default is to NOT accept these as it may be an un-intended - network loop. - - Functional default: - enabled if accept_ra_from_local is enabled - on a specific interface. - disabled if accept_ra_from_local is disabled - on a specific interface. - -accept_ra_min_hop_limit - INTEGER - Minimum hop limit Information in Router Advertisement. - - Hop limit Information in Router Advertisement less than this - variable shall be ignored. - - Default: 1 - -accept_ra_pinfo - BOOLEAN - Learn Prefix Information in Router Advertisement. - - Functional default: enabled if accept_ra is enabled. - disabled if accept_ra is disabled. - -accept_ra_rt_info_min_plen - INTEGER - Minimum prefix length of Route Information in RA. - - Route Information w/ prefix smaller than this variable shall - be ignored. - - Functional default: 0 if accept_ra_rtr_pref is enabled. - -1 if accept_ra_rtr_pref is disabled. - -accept_ra_rt_info_max_plen - INTEGER - Maximum prefix length of Route Information in RA. - - Route Information w/ prefix larger than this variable shall - be ignored. - - Functional default: 0 if accept_ra_rtr_pref is enabled. - -1 if accept_ra_rtr_pref is disabled. - -accept_ra_rtr_pref - BOOLEAN - Accept Router Preference in RA. - - Functional default: enabled if accept_ra is enabled. - disabled if accept_ra is disabled. - -accept_ra_mtu - BOOLEAN - Apply the MTU value specified in RA option 5 (RFC4861). If - disabled, the MTU specified in the RA will be ignored. - - Functional default: enabled if accept_ra is enabled. - disabled if accept_ra is disabled. - -accept_redirects - BOOLEAN - Accept Redirects. - - Functional default: enabled if local forwarding is disabled. - disabled if local forwarding is enabled. - -accept_source_route - INTEGER - Accept source routing (routing extension header). - - >= 0: Accept only routing header type 2. - < 0: Do not accept routing header. - - Default: 0 - -autoconf - BOOLEAN - Autoconfigure addresses using Prefix Information in Router - Advertisements. - - Functional default: enabled if accept_ra_pinfo is enabled. - disabled if accept_ra_pinfo is disabled. - -dad_transmits - INTEGER - The amount of Duplicate Address Detection probes to send. - Default: 1 - -forwarding - INTEGER - Configure interface-specific Host/Router behaviour. - - Note: It is recommended to have the same setting on all - interfaces; mixed router/host scenarios are rather uncommon. - - Possible values are: - 0 Forwarding disabled - 1 Forwarding enabled - - FALSE (0): - - By default, Host behaviour is assumed. This means: - - 1. IsRouter flag is not set in Neighbour Advertisements. - 2. If accept_ra is TRUE (default), transmit Router - Solicitations. - 3. If accept_ra is TRUE (default), accept Router - Advertisements (and do autoconfiguration). - 4. If accept_redirects is TRUE (default), accept Redirects. - - TRUE (1): - - If local forwarding is enabled, Router behaviour is assumed. - This means exactly the reverse from the above: - - 1. IsRouter flag is set in Neighbour Advertisements. - 2. Router Solicitations are not sent unless accept_ra is 2. - 3. Router Advertisements are ignored unless accept_ra is 2. - 4. Redirects are ignored. - - Default: 0 (disabled) if global forwarding is disabled (default), - otherwise 1 (enabled). - -hop_limit - INTEGER - Default Hop Limit to set. 
- Default: 64 - -mtu - INTEGER - Default Maximum Transmission Unit (MTU) - Default: 1280 (IPv6 required minimum) - -ip_nonlocal_bind - BOOLEAN - If set, allows processes to bind() to non-local IPv6 addresses, - which can be quite useful - but may break some applications. - Default: 0 - -router_probe_interval - INTEGER - Minimum interval (in seconds) between the Router Probing described - in RFC4191. - - Default: 60 - -router_solicitation_delay - INTEGER - Number of seconds to wait after the interface is brought up - before sending Router Solicitations. - Default: 1 - -router_solicitation_interval - INTEGER - Number of seconds to wait between Router Solicitations. - Default: 4 - -router_solicitations - INTEGER - Number of Router Solicitations to send until assuming no - routers are present. - Default: 3 - -use_oif_addrs_only - BOOLEAN - When enabled, the candidate source addresses for destinations - routed via this interface are restricted to the set of addresses - configured on this interface (see RFC 6724, section 4). - - Default: false - -use_tempaddr - INTEGER - Preference for Privacy Extensions (RFC3041). - <= 0 : disable Privacy Extensions - == 1 : enable Privacy Extensions, but prefer public - addresses over temporary addresses. - > 1 : enable Privacy Extensions and prefer temporary - addresses over public addresses. - Default: 0 (for most devices) - -1 (for point-to-point devices and loopback devices) - -temp_valid_lft - INTEGER - Valid lifetime (in seconds) for temporary addresses. - Default: 604800 (7 days) - -temp_prefered_lft - INTEGER - Preferred lifetime (in seconds) for temporary addresses. - Default: 86400 (1 day) - -keep_addr_on_down - INTEGER - Keep all IPv6 addresses on an interface down event. If set, static - global addresses with no expiration time are not flushed. - >0 : enabled - 0 : system default - <0 : disabled - - Default: 0 (addresses are removed) - -max_desync_factor - INTEGER - Maximum value for DESYNC_FACTOR, which is a random value - that ensures that clients don't synchronize with each - other and generate new addresses at exactly the same time. - The value is in seconds. - Default: 600 - -regen_max_retry - INTEGER - Number of attempts before giving up on generating - valid temporary addresses. - Default: 5 - -max_addresses - INTEGER - Maximum number of autoconfigured addresses per interface. Setting - this to zero disables the limitation. It is not recommended to set this - value too large (or to zero) because it would be an easy way to - crash the kernel by allowing too many addresses to be created. - Default: 16 - -disable_ipv6 - BOOLEAN - Disable IPv6 operation. If accept_dad is set to 2, this value - will be dynamically set to TRUE if DAD fails for the link-local - address. - Default: FALSE (enable IPv6 operation) - - When this value is changed from 1 to 0 (IPv6 is being enabled), - it will dynamically create a link-local address on the given - interface and start Duplicate Address Detection, if necessary. - - When this value is changed from 0 to 1 (IPv6 is being disabled), - it will dynamically delete all addresses and routes on the given - interface. From then on it will not be possible to add addresses/routes - to the selected interface. - -accept_dad - INTEGER - Whether to accept DAD (Duplicate Address Detection). - 0: Disable DAD - 1: Enable DAD (default) - 2: Enable DAD, and disable IPv6 operation if a MAC-based duplicate - link-local address has been found.
- - DAD operation and mode on a given interface will be selected according - to the maximum value of conf/{all,interface}/accept_dad. - -force_tllao - BOOLEAN - Enable sending the target link-layer address option even when - responding to a unicast neighbor solicitation. - Default: FALSE - - Quoting from RFC 2461, section 4.4, Target link-layer address: - - "The option MUST be included for multicast solicitations in order to - avoid infinite Neighbor Solicitation "recursion" when the peer node - does not have a cache entry to return a Neighbor Advertisements - message. When responding to unicast solicitations, the option can be - omitted since the sender of the solicitation has the correct link- - layer address; otherwise it would not have be able to send the unicast - solicitation in the first place. However, including the link-layer - address in this case adds little overhead and eliminates a potential - race condition where the sender deletes the cached link-layer address - prior to receiving a response to a previous solicitation." - -ndisc_notify - BOOLEAN - Define the mode for notification of address and device changes. - 0 - (default): do nothing - 1 - Generate unsolicited neighbour advertisements when the device is brought - up or the hardware address changes. - -ndisc_tclass - INTEGER - The IPv6 Traffic Class to use by default when sending IPv6 Neighbor - Discovery (Router Solicitation, Router Advertisement, Neighbor - Solicitation, Neighbor Advertisement, Redirect) messages. - These 8 bits can be interpreted as 6 high order bits holding the DSCP - value and 2 low order bits representing ECN (which you probably want - to leave cleared). - 0 - (default) - -mldv1_unsolicited_report_interval - INTEGER - The interval in milliseconds in which the next unsolicited - MLDv1 report retransmit will take place. - Default: 10000 (10 seconds) - -mldv2_unsolicited_report_interval - INTEGER - The interval in milliseconds in which the next unsolicited - MLDv2 report retransmit will take place. - Default: 1000 (1 second) - -force_mld_version - INTEGER - 0 - (default) No enforcement of an MLD version, MLDv1 fallback allowed - 1 - Enforce use of MLD version 1 - 2 - Enforce use of MLD version 2 - -suppress_frag_ndisc - INTEGER - Control RFC 6980 (Security Implications of IPv6 Fragmentation - with IPv6 Neighbor Discovery) behavior: - 1 - (default) discard fragmented neighbor discovery packets - 0 - allow fragmented neighbor discovery packets - -optimistic_dad - BOOLEAN - Whether to perform Optimistic Duplicate Address Detection (RFC 4429). - 0: disabled (default) - 1: enabled - - Optimistic Duplicate Address Detection for the interface will be enabled - if at least one of conf/{all,interface}/optimistic_dad is set to 1, - it will be disabled otherwise. - -use_optimistic - BOOLEAN - If enabled, do not classify optimistic addresses as deprecated during - source address selection. Preferred addresses will still be chosen - before optimistic addresses, subject to other ranking in the source - address selection algorithm. - 0: disabled (default) - 1: enabled - - This will be enabled if at least one of - conf/{all,interface}/use_optimistic is set to 1, disabled otherwise. - -stable_secret - IPv6 address - This IPv6 address will be used as a secret to generate IPv6 - addresses for link-local addresses and autoconfigured - ones. All addresses generated after setting this secret will - be stable privacy ones by default. This can be changed via the - addrgenmode ip-link.
conf/default/stable_secret is used as the - secret for the namespace; the interface-specific ones can - override it. Writes to conf/all/stable_secret are refused. - - It is recommended to generate this secret during installation - of a system and keep it stable after that. - - By default the stable secret is unset. - -addr_gen_mode - INTEGER - Defines how link-local and autoconf addresses are generated. - - 0: generate address based on EUI64 (default) - 1: do not generate a link-local address, use EUI64 for addresses generated - from autoconf - 2: generate stable privacy addresses, using the secret from - stable_secret (RFC7217) - 3: generate stable privacy addresses, using a random secret if unset - -drop_unicast_in_l2_multicast - BOOLEAN - Drop any unicast IPv6 packets that are received in link-layer - multicast (or broadcast) frames. - - By default this is turned off. - -drop_unsolicited_na - BOOLEAN - Drop all unsolicited neighbor advertisements, for example if there's - a known good NA proxy on the network and such frames need not be used - (or in the case of 802.11, must not be used to prevent attacks.) - - By default this is turned off. - -enhanced_dad - BOOLEAN - Include a nonce option in the IPv6 neighbor solicitation messages used for - duplicate address detection per RFC7527. A received DAD NS will only signal - a duplicate address if the nonce is different. This avoids any false - detection of duplicates due to loopback of the NS messages that we send. - The nonce option will be sent on an interface unless both of - conf/{all,interface}/enhanced_dad are set to FALSE. - Default: TRUE - -icmp/*: -ratelimit - INTEGER - Limit the maximal rates for sending ICMPv6 messages. - 0 to disable any limiting, - otherwise the minimal space between responses in milliseconds. - Default: 1000 - -ratemask - list of comma separated ranges - For ICMPv6 message types matching the ranges in the ratemask, limit - the sending of the message according to the ratelimit parameter. - - The format used for both input and output is a comma separated - list of ranges (e.g. "0-127,129" for ICMPv6 message types 0 to 127 and - 129). Writing to the file will clear all previous ranges of ICMPv6 - message types and update the current list with the input. - - Refer to: https://www.iana.org/assignments/icmpv6-parameters/icmpv6-parameters.xhtml - for numerical values of ICMPv6 message types, e.g. echo request is 128 - and echo reply is 129. - - Default: 0-1,3-127 (rate limit ICMPv6 errors except Packet Too Big) - -echo_ignore_all - BOOLEAN - If set non-zero, then the kernel will ignore all ICMP ECHO - requests sent to it over the IPv6 protocol. - Default: 0 - -echo_ignore_multicast - BOOLEAN - If set non-zero, then the kernel will ignore all ICMP ECHO - requests sent to it over the IPv6 protocol via multicast. - Default: 0 - -echo_ignore_anycast - BOOLEAN - If set non-zero, then the kernel will ignore all ICMP ECHO - requests sent to it over the IPv6 protocol destined to an anycast address. - Default: 0 - -xfrm6_gc_thresh - INTEGER - (Obsolete since linux-4.14) - The threshold at which we will start garbage collecting for IPv6 - destination cache entries. At twice this value the system will - refuse new allocations. - - -IPv6 Update by: -Pekka Savola -YOSHIFUJI Hideaki / USAGI Project - - -/proc/sys/net/bridge/* Variables: - -bridge-nf-call-arptables - BOOLEAN - 1 : pass bridged ARP traffic to arptables' FORWARD chain. - 0 : disable this.
- Default: 1 - -bridge-nf-call-iptables - BOOLEAN - 1 : pass bridged IPv4 traffic to iptables' chains. - 0 : disable this. - Default: 1 - -bridge-nf-call-ip6tables - BOOLEAN - 1 : pass bridged IPv6 traffic to ip6tables' chains. - 0 : disable this. - Default: 1 - -bridge-nf-filter-vlan-tagged - BOOLEAN - 1 : pass bridged vlan-tagged ARP/IP/IPv6 traffic to {arp,ip,ip6}tables. - 0 : disable this. - Default: 0 - -bridge-nf-filter-pppoe-tagged - BOOLEAN - 1 : pass bridged pppoe-tagged IP/IPv6 traffic to {ip,ip6}tables. - 0 : disable this. - Default: 0 - -bridge-nf-pass-vlan-input-dev - BOOLEAN - 1: if bridge-nf-filter-vlan-tagged is enabled, try to find a vlan - interface on the bridge and set the netfilter input device to the vlan. - This allows use of e.g. "iptables -i br0.1" and makes the REDIRECT - target work with vlan-on-top-of-bridge interfaces. When no matching - vlan interface is found, or this switch is off, the input device is - set to the bridge interface. - 0: disable bridge netfilter vlan interface lookup. - Default: 0 - -proc/sys/net/sctp/* Variables: - -addip_enable - BOOLEAN - Enable or disable extension of Dynamic Address Reconfiguration - (ADD-IP) functionality specified in RFC5061. This extension provides - the ability to dynamically add and remove new addresses for the SCTP - associations. - - 1: Enable extension. - - 0: Disable extension. - - Default: 0 - -pf_enable - INTEGER - Enable or disable pf (pf is short for potentially failed) state. A value - of pf_retrans > path_max_retrans also disables pf state. That is, pf - state is disabled either when pf_enable is 0 or when pf_retrans > - path_max_retrans. Since pf_retrans and path_max_retrans can be changed - by a userspace application, a user who disabled pf state by setting - pf_retrans > path_max_retrans may find it silently re-enabled when one - of those two values is changed later. This sysctl therefore provides an - explicit way to enable and disable pf state dynamically. See: - https://datatracker.ietf.org/doc/draft-ietf-tsvwg-sctp-failover for - details. - - 1: Enable pf. - - 0: Disable pf. - - Default: 1 - -pf_expose - INTEGER - Unset or enable/disable pf (pf is short for potentially failed) state - exposure. Applications can control the exposure of the PF path state - in the SCTP_PEER_ADDR_CHANGE event and the SCTP_GET_PEER_ADDR_INFO - sockopt. When it's unset, no SCTP_PEER_ADDR_CHANGE event with - SCTP_ADDR_PF state will be sent, but an SCTP_PF-state transport info - can still be retrieved via the SCTP_GET_PEER_ADDR_INFO sockopt. When - it's enabled, an SCTP_PEER_ADDR_CHANGE event will be sent for a - transport entering SCTP_PF state and an SCTP_PF-state transport info - can be retrieved via the SCTP_GET_PEER_ADDR_INFO sockopt. When it's - disabled, no SCTP_PEER_ADDR_CHANGE event will be sent and - SCTP_GET_PEER_ADDR_INFO returns -EACCES when trying to retrieve an - SCTP_PF-state transport info. - - 0: Unset pf state exposure, compatible with old applications. - - 1: Disable pf state exposure. - - 2: Enable pf state exposure. - - Default: 0 - -addip_noauth_enable - BOOLEAN - Dynamic Address Reconfiguration (ADD-IP) requires the use of - authentication to protect the operations of adding or removing new - addresses. This requirement is mandated so that unauthorized hosts - would not be able to hijack associations. However, older - implementations may not have implemented this requirement while - allowing the ADD-IP extension.
For reasons of interoperability, - we provide this variable to control the enforcement of the - authentication requirement. - - 1: Allow ADD-IP extension to be used without authentication. This - should only be set in a closed environment for interoperability - with older implementations. - - 0: Enforce the authentication requirement - - Default: 0 - -auth_enable - BOOLEAN - Enable or disable the Authenticated Chunks extension. This extension - provides the ability to send and receive authenticated chunks and is - required for secure operation of the Dynamic Address Reconfiguration - (ADD-IP) extension. - - 1: Enable this extension. - 0: Disable this extension. - - Default: 0 - -prsctp_enable - BOOLEAN - Enable or disable the Partial Reliability extension (RFC3758) which - is used to notify peers that a given DATA chunk should no longer be expected. - - 1: Enable extension - 0: Disable - - Default: 1 - -max_burst - INTEGER - The limit of the number of new packets that can be initially sent. It - controls how bursty the generated traffic can be. - - Default: 4 - -association_max_retrans - INTEGER - Set the maximum number of retransmissions that an association can - attempt before deciding that the remote end is unreachable. If this value - is exceeded, the association is terminated. - - Default: 10 - -max_init_retransmits - INTEGER - The maximum number of retransmissions of INIT and COOKIE-ECHO chunks - that an association will attempt before declaring the destination - unreachable and terminating. - - Default: 8 - -path_max_retrans - INTEGER - The maximum number of retransmissions that will be attempted on a given - path. Once this threshold is exceeded, the path is considered - unreachable, and new traffic will use a different path when the - association is multihomed. - - Default: 5 - -pf_retrans - INTEGER - The number of retransmissions that will be attempted on a given path - before traffic is redirected to an alternate transport (should one - exist). Note this is distinct from path_max_retrans, as a path that - passes the pf_retrans threshold can still be used. It is only - deprioritized when a transmission path is selected by the stack. This - setting is primarily used to enable fast failover mechanisms without - having to reduce path_max_retrans to a very low value. See: - http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt - for details. Note also that a value of pf_retrans > path_max_retrans - disables this feature. Since both pf_retrans and path_max_retrans can - be changed by a userspace application, the variable pf_enable is used to - disable pf state. - - Default: 0 - -ps_retrans - INTEGER - Primary.Switchover.Max.Retrans (PSMR) is a tunable parameter coming - from section 5, "Primary Path Switchover", of RFC 7829. The primary path - will be changed to another active path when the path error counter on - the old primary path exceeds PSMR, so that "the SCTP sender is allowed - to continue data transmission on a new working path even when the old - primary destination address becomes active again". Note this feature - is disabled by default by initializing 'ps_retrans' per netns as 0xffff, - and its value can't be less than 'pf_retrans' when changed via sysctl. - - Default: 0xffff - -rto_initial - INTEGER - The initial round trip timeout value in milliseconds that will be used - in calculating round trip times. This is the initial time interval - for retransmissions. - - Default: 3000 - -rto_max - INTEGER - The maximum value (in milliseconds) of the round trip timeout.
This - is the largest time interval that can elapse between retransmissions. - - Default: 60000 - -rto_min - INTEGER - The minimum value (in milliseconds) of the round trip timeout. This - is the smallest time interval that can elapse between retransmissions. - - Default: 1000 - -hb_interval - INTEGER - The interval (in milliseconds) between HEARTBEAT chunks. These chunks - are sent at the specified interval on idle paths to probe the state of - a given path between the two endpoints of an association. - - Default: 30000 - -sack_timeout - INTEGER - The amount of time (in milliseconds) that the implementation will wait - to send a SACK. - - Default: 200 - -valid_cookie_life - INTEGER - The default lifetime of the SCTP cookie (in milliseconds). The cookie - is used during association establishment. - - Default: 60000 - -cookie_preserve_enable - BOOLEAN - Enable or disable the ability to extend the lifetime of the SCTP cookie - that is used during the establishment phase of an SCTP association. - - 1: Enable cookie lifetime extension. - 0: Disable - - Default: 1 - -cookie_hmac_alg - STRING - Select the HMAC algorithm used when generating the cookie value sent by - a listening sctp socket to a connecting client in the INIT-ACK chunk. - Valid values are: - * md5 - * sha1 - * none - The ability to assign md5 or sha1 as the selected alg is predicated on the - configuration of those algorithms at build time (CONFIG_CRYPTO_MD5 and - CONFIG_CRYPTO_SHA1). - - Default: Dependent on configuration. MD5 if available, else SHA1 if - available, else none. - -rcvbuf_policy - INTEGER - Determines if the receive buffer is attributed to the socket or to the - association. SCTP supports the capability to create multiple - associations on a single socket. When using this capability, it is - possible that a single stalled association that's buffering a lot - of data may block other associations from delivering their data by - consuming all of the receive buffer space. To work around this, - the rcvbuf_policy could be set to attribute the receive buffer space - to each association instead of the socket. This prevents the described - blocking. - - 1: rcvbuf space is per association - 0: rcvbuf space is per socket - - Default: 0 - -sndbuf_policy - INTEGER - Similar to rcvbuf_policy above, this applies to send buffer space. - - 1: Send buffer is tracked per association - 0: Send buffer is tracked per socket. - - Default: 0 - -sctp_mem - vector of 3 INTEGERs: min, pressure, max - Number of pages allowed for queueing by all SCTP sockets. - - min: Below this number of pages SCTP is not bothered about its - memory appetite. When the amount of memory allocated by SCTP exceeds - this number, SCTP starts to moderate memory usage. - - pressure: This value was introduced to follow the format of tcp_mem. - - max: Number of pages allowed for queueing by all SCTP sockets. - - Default is calculated at boot time from the amount of available memory. - -sctp_rmem - vector of 3 INTEGERs: min, default, max - Only the first value ("min") is used, "default" and "max" are - ignored. - - min: Minimal size of the receive buffer used by SCTP sockets. - It is guaranteed to each SCTP socket (but not association) even - under moderate memory pressure. - - Default: 4K - -sctp_wmem - vector of 3 INTEGERs: min, default, max - Currently this tunable has no effect.
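The rto_initial/rto_min/rto_max sysctls above only seed defaults; applications can query or override the per-socket values through the standard SCTP_RTOINFO socket option from RFC 6458. A minimal sketch, assuming the lksctp-tools header <netinet/sctp.h> is installed and the kernel was built with SCTP support (values are in milliseconds):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/sctp.h>

int main(void)
{
	struct sctp_rtoinfo rto;
	socklen_t len = sizeof(rto);
	int fd;

	fd = socket(AF_INET, SOCK_STREAM, IPPROTO_SCTP);
	if (fd < 0) {
		perror("socket");	/* e.g. kernel built without SCTP */
		return 1;
	}

	memset(&rto, 0, sizeof(rto));
	/* srto_assoc_id == 0 addresses the socket-wide defaults, which
	 * are seeded from the rto_initial/rto_min/rto_max sysctls.
	 */
	if (getsockopt(fd, IPPROTO_SCTP, SCTP_RTOINFO, &rto, &len) < 0) {
		perror("getsockopt");
		close(fd);
		return 1;
	}
	printf("RTO initial=%u min=%u max=%u (ms)\n",
	       rto.srto_initial, rto.srto_min, rto.srto_max);
	close(fd);
	return 0;
}

The same structure can be passed to setsockopt() to change the values for a single socket without touching the namespace-wide sysctls.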
- -addr_scope_policy - INTEGER - Control IPv4 address scoping - draft-stewart-tsvwg-sctp-ipv4-00 - - 0 - Disable IPv4 address scoping - 1 - Enable IPv4 address scoping - 2 - Follow draft but allow IPv4 private addresses - 3 - Follow draft but allow IPv4 link local addresses - - Default: 1 - - -/proc/sys/net/core/* - Please see: Documentation/admin-guide/sysctl/net.rst for descriptions of these entries. - - -/proc/sys/net/unix/* -max_dgram_qlen - INTEGER - The maximum length of the dgram socket receive queue - - Default: 10 - diff --git a/Documentation/networking/snmp_counter.rst b/Documentation/networking/snmp_counter.rst index 10e11099e74a..4edd0d38779e 100644 --- a/Documentation/networking/snmp_counter.rst +++ b/Documentation/networking/snmp_counter.rst @@ -792,7 +792,7 @@ counters to indicate the ACK is skipped in which scenario. The ACK would only be skipped if the received packet is either a SYN packet or it has no data. -.. _sysctl document: https://www.kernel.org/doc/Documentation/networking/ip-sysctl.txt +.. _sysctl document: https://www.kernel.org/doc/Documentation/networking/ip-sysctl.rst * TcpExtTCPACKSkippedSynRecv diff --git a/net/Kconfig b/net/Kconfig index df8d8c9bd021..8b1f85820a6b 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -86,7 +86,7 @@ config INET "Sysctl support" below, you can change various aspects of the behavior of the TCP/IP code by writing to the (virtual) files in /proc/sys/net/ipv4/*; the options are explained in the file - <file:Documentation/networking/ip-sysctl.txt>. + <file:Documentation/networking/ip-sysctl.rst>. Short answer: say Y. diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 25a8888826b8..5da4733067fb 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -49,7 +49,7 @@ config IP_ADVANCED_ROUTER Note that some distributions enable it in startup scripts. For details about rp_filter strict and loose mode read - <file:Documentation/networking/ip-sysctl.txt>. + <file:Documentation/networking/ip-sysctl.rst>. If unsure, say N here. diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index fc61f51d87a3..956a806649f7 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -853,7 +853,7 @@ static bool icmp_unreach(struct sk_buff *skb) case ICMP_FRAG_NEEDED: /* for documentation of the ip_no_pmtu_disc * values please see - * Documentation/networking/ip-sysctl.txt + * Documentation/networking/ip-sysctl.rst */ switch (net->ipv4.sysctl_ip_no_pmtu_disc) { default: -- cgit v1.2.3 From 19093313cb0486d568232934bb80dd422d891623 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:50 +0200 Subject: docs: networking: convert ipv6.txt to ReST Not much to be done here: - add SPDX header; - add a document title; - mark a literal as such, in order to avoid a warning; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/admin-guide/kernel-parameters.txt | 6 +- Documentation/networking/index.rst | 1 + Documentation/networking/ipv6.rst | 78 +++++++++++++++++++++++++ Documentation/networking/ipv6.txt | 72 ----------------------- net/ipv6/Kconfig | 2 +- 5 files changed, 83 insertions(+), 76 deletions(-) create mode 100644 Documentation/networking/ipv6.rst delete mode 100644 Documentation/networking/ipv6.txt (limited to 'net') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index e37db6f1be64..e43f2e1f2958 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -356,7 +356,7 @@ shot down by NMI autoconf= [IPV6] - See Documentation/networking/ipv6.txt. + See Documentation/networking/ipv6.rst.
show_lapic= [APIC,X86] Advanced Programmable Interrupt Controller Limit apic dumping. The parameter defines the maximal @@ -872,7 +872,7 @@ miss to occur. disable= [IPV6] - See Documentation/networking/ipv6.txt. + See Documentation/networking/ipv6.rst. hardened_usercopy= [KNL] Under CONFIG_HARDENED_USERCOPY, whether @@ -912,7 +912,7 @@ to workaround buggy firmware. disable_ipv6= [IPV6] - See Documentation/networking/ipv6.txt. + See Documentation/networking/ipv6.rst. disable_mtrr_cleanup [X86] The kernel tries to adjust MTRR layout from continuous diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 7d133d8dbe2a..709675464e51 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -70,6 +70,7 @@ Contents: iphase ipsec ip-sysctl + ipv6 .. only:: subproject and html diff --git a/Documentation/networking/ipv6.rst b/Documentation/networking/ipv6.rst new file mode 100644 index 000000000000..ba09c2f2dcc7 --- /dev/null +++ b/Documentation/networking/ipv6.rst @@ -0,0 +1,78 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==== +IPv6 +==== + + +Options for the ipv6 module are supplied as parameters at load time. + +Module options may be given as command line arguments to the insmod +or modprobe command, but are usually specified in either +``/etc/modules.d/*.conf`` configuration files, or in a distro-specific +configuration file. + +The available ipv6 module parameters are listed below. If a parameter +is not specified the default value is used. + +The parameters are as follows: + +disable + + Specifies whether to load the IPv6 module, but disable all + its functionality. This might be used when another module + has a dependency on the IPv6 module being loaded, but no + IPv6 addresses or operations are desired. + + The possible values and their effects are: + + 0 + IPv6 is enabled. + + This is the default value. + + 1 + IPv6 is disabled. + + No IPv6 addresses will be added to interfaces, and + it will not be possible to open an IPv6 socket. + + A reboot is required to enable IPv6. + +autoconf + + Specifies whether to enable IPv6 address autoconfiguration + on all interfaces. This might be used when one does not wish + for addresses to be automatically generated from prefixes + received in Router Advertisements. + + The possible values and their effects are: + + 0 + IPv6 address autoconfiguration is disabled on all interfaces. + + Only the IPv6 loopback address (::1) and link-local addresses + will be added to interfaces. + + 1 + IPv6 address autoconfiguration is enabled on all interfaces. + + This is the default value. + +disable_ipv6 + + Specifies whether to disable IPv6 on all interfaces. + This might be used when no IPv6 addresses are desired. + + The possible values and their effects are: + + 0 + IPv6 is enabled on all interfaces. + + This is the default value. + + 1 + IPv6 is disabled on all interfaces. + + No IPv6 addresses will be added to interfaces. + diff --git a/Documentation/networking/ipv6.txt b/Documentation/networking/ipv6.txt deleted file mode 100644 index 6cd74fa55358..000000000000 --- a/Documentation/networking/ipv6.txt +++ /dev/null @@ -1,72 +0,0 @@ - -Options for the ipv6 module are supplied as parameters at load time. - -Module options may be given as command line arguments to the insmod -or modprobe command, but are usually specified in either -/etc/modules.d/*.conf configuration files, or in a distro-specific -configuration file. - -The available ipv6 module parameters are listed below. 
If a parameter -is not specified the default value is used. - -The parameters are as follows: - -disable - - Specifies whether to load the IPv6 module, but disable all - its functionality. This might be used when another module - has a dependency on the IPv6 module being loaded, but no - IPv6 addresses or operations are desired. - - The possible values and their effects are: - - 0 - IPv6 is enabled. - - This is the default value. - - 1 - IPv6 is disabled. - - No IPv6 addresses will be added to interfaces, and - it will not be possible to open an IPv6 socket. - - A reboot is required to enable IPv6. - -autoconf - - Specifies whether to enable IPv6 address autoconfiguration - on all interfaces. This might be used when one does not wish - for addresses to be automatically generated from prefixes - received in Router Advertisements. - - The possible values and their effects are: - - 0 - IPv6 address autoconfiguration is disabled on all interfaces. - - Only the IPv6 loopback address (::1) and link-local addresses - will be added to interfaces. - - 1 - IPv6 address autoconfiguration is enabled on all interfaces. - - This is the default value. - -disable_ipv6 - - Specifies whether to disable IPv6 on all interfaces. - This might be used when no IPv6 addresses are desired. - - The possible values and their effects are: - - 0 - IPv6 is enabled on all interfaces. - - This is the default value. - - 1 - IPv6 is disabled on all interfaces. - - No IPv6 addresses will be added to interfaces. - diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 2ccaee98fddb..5a6111da26c4 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -13,7 +13,7 @@ menuconfig IPV6 For general information about IPv6, see . For specific information about IPv6 under Linux, see - Documentation/networking/ipv6.txt and read the HOWTO at + Documentation/networking/ipv6.rst and read the HOWTO at To compile this protocol support as a module, choose M here: the -- cgit v1.2.3 From 08fad438bed0ada1a3308987862327286fcbb5f5 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Sat, 25 Apr 2020 18:57:12 +0300 Subject: mac80211: TX legacy rate control for Beacon frames Use the Beacon frame specific legacy rate configuration, if specified for AP or mesh, instead of the generic rate mask when selecting the TX rate for Beacon frames. 
Signed-off-by: Jouni Malinen Link: https://lore.kernel.org/r/20200425155713.25687-4-jouni@codeaurora.org Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 26 +++++++++++++++++++++++++- net/mac80211/ieee80211_i.h | 4 ++++ net/mac80211/mesh.c | 1 + net/mac80211/tx.c | 5 ++++- 4 files changed, 34 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index ae3e06375a28..548a384b0509 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -994,7 +994,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, BSS_CHANGED_TWT | BSS_CHANGED_HE_OBSS_PD | BSS_CHANGED_HE_BSS_COLOR; - int err; + int i, err; int prev_beacon_int; old = sdata_dereference(sdata->u.ap.beacon, sdata); @@ -1085,6 +1085,17 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow |= IEEE80211_P2P_OPPPS_ENABLE_BIT; + sdata->beacon_rate_set = false; + if (wiphy_ext_feature_isset(local->hw.wiphy, + NL80211_EXT_FEATURE_BEACON_RATE_LEGACY)) { + for (i = 0; i < NUM_NL80211_BANDS; i++) { + sdata->beacon_rateidx_mask[i] = + params->beacon_rate.control[i].legacy; + if (sdata->beacon_rateidx_mask[i]) + sdata->beacon_rate_set = true; + } + } + err = ieee80211_assign_beacon(sdata, ¶ms->beacon, NULL); if (err < 0) { ieee80211_vif_release_channel(sdata); @@ -1189,6 +1200,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) ieee80211_free_keys(sdata, true); sdata->vif.bss_conf.enable_beacon = false; + sdata->beacon_rate_set = false; sdata->vif.bss_conf.ssid_len = 0; clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED); @@ -1949,6 +1961,7 @@ static int copy_mesh_setup(struct ieee80211_if_mesh *ifmsh, const u8 *old_ie; struct ieee80211_sub_if_data *sdata = container_of(ifmsh, struct ieee80211_sub_if_data, u.mesh); + int i; /* allocate information elements */ new_ie = NULL; @@ -1987,6 +2000,17 @@ static int copy_mesh_setup(struct ieee80211_if_mesh *ifmsh, sdata->vif.bss_conf.beacon_int = setup->beacon_interval; sdata->vif.bss_conf.dtim_period = setup->dtim_period; + sdata->beacon_rate_set = false; + if (wiphy_ext_feature_isset(sdata->local->hw.wiphy, + NL80211_EXT_FEATURE_BEACON_RATE_LEGACY)) { + for (i = 0; i < NUM_NL80211_BANDS; i++) { + sdata->beacon_rateidx_mask[i] = + setup->beacon_rate.control[i].legacy; + if (sdata->beacon_rateidx_mask[i]) + sdata->beacon_rate_set = true; + } + } + return 0; } diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 9407cf44305c..8cbae66b5cdb 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -962,6 +962,10 @@ struct ieee80211_sub_if_data { bool rc_has_vht_mcs_mask[NUM_NL80211_BANDS]; u16 rc_rateidx_vht_mcs_mask[NUM_NL80211_BANDS][NL80211_VHT_NSS_MAX]; + /* Beacon frame (non-MCS) rate (as a bitmap) */ + u32 beacon_rateidx_mask[NUM_NL80211_BANDS]; + bool beacon_rate_set; + union { struct ieee80211_if_ap ap; struct ieee80211_if_wds wds; diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 36978a0e5000..5930d07b1e43 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -994,6 +994,7 @@ void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata) /* stop the beacon */ ifmsh->mesh_id_len = 0; sdata->vif.bss_conf.enable_beacon = false; + sdata->beacon_rate_set = false; clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED); 
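As a side note on the bitmap semantics used by beacon_rateidx_mask[] in the hunks above: each set bit selects one index into the band's legacy bitrate table, and rate control is restricted to the set bits. The following stand-alone sketch is not mac80211 code; the function name is made up for illustration and simply shows how a rate-control step could pick the lowest permitted index from such a mask:

#include <stdio.h>

/* Illustrative only: return the lowest rate index permitted by a
 * rate_idx_mask-style bitmap, or -1 for an empty mask.
 */
static int lowest_rate_idx(unsigned int mask)
{
	int i;

	for (i = 0; i < 32; i++)
		if (mask & (1u << i))
			return i;
	return -1;
}

int main(void)
{
	unsigned int beacon_mask = 0x0000000f;	/* four lowest legacy rates */

	printf("chosen rate index: %d\n", lowest_rate_idx(beacon_mask));
	return 0;
}

Real rate-control algorithms consult more state than this, but the mask contract is the same: an all-zero mask means "no restriction configured", which is why the tx.c hunk below falls back to rc_rateidx_mask when the beacon mask is unset.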
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 3dc1990e15c5..6dad67eb60b2 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -4883,7 +4883,10 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, txrc.bss_conf = &sdata->vif.bss_conf; txrc.skb = skb; txrc.reported_rate.idx = -1; - txrc.rate_idx_mask = sdata->rc_rateidx_mask[band]; + if (sdata->beacon_rate_set && sdata->beacon_rateidx_mask[band]) + txrc.rate_idx_mask = sdata->beacon_rateidx_mask[band]; + else + txrc.rate_idx_mask = sdata->rc_rateidx_mask[band]; txrc.bss = true; rate_control_get_rate(sdata, NULL, &txrc); -- cgit v1.2.3 From 60689de46c7f6a0028c8b37b6f03db68cbfad8ed Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Fri, 24 Apr 2020 15:41:39 -0700 Subject: mac80211: fix memory overlap due to variable length param The HE operation element stored in bss_conf includes a variable-length optional field that is followed by other members of the structure. Although the optional field is never used, accesses to it actually refer to the next member of the bss_conf structure, which is not correct. Fix this by declaring only the needed HE operation fields within bss_conf itself. Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/1587768108-25248-2-git-send-email-rmanohar@codeaurora.org Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath11k/mac.c | 3 +-- include/net/mac80211.h | 7 +++++-- net/mac80211/he.c | 13 +++++-------- 3 files changed, 11 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index 9f8bc19cc5ae..06d063274eea 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -1168,8 +1168,7 @@ static void ath11k_peer_assoc_h_he(struct ath11k *ar, sizeof(arg->peer_he_cap_macinfo)); memcpy(&arg->peer_he_cap_phyinfo, he_cap->he_cap_elem.phy_cap_info, sizeof(arg->peer_he_cap_phyinfo)); - memcpy(&arg->peer_he_ops, &vif->bss_conf.he_operation, - sizeof(arg->peer_he_ops)); + arg->peer_he_ops = vif->bss_conf.he_oper.params; /* the top most byte is used to indicate BSS color info */ arg->peer_he_ops &= 0xffffff; diff --git a/include/net/mac80211.h b/include/net/mac80211.h index ecb219e3ec4f..78f7ce586287 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -604,7 +604,7 @@ struct ieee80211_ftm_responder_params { * nontransmitted BSSIDs * @profile_periodicity: the least number of beacon frames need to be received * in order to discover all the nontransmitted BSSIDs in the set. - * @he_operation: HE operation information of the AP we are connected to + * @he_oper: HE operation information of the AP we are connected to * @he_obss_pd: OBSS Packet Detection parameters.
* @he_bss_color: BSS coloring settings, if BSS supports HE */ @@ -668,7 +668,10 @@ struct ieee80211_bss_conf { u8 bssid_indicator; bool ema_ap; u8 profile_periodicity; - struct ieee80211_he_operation he_operation; + struct { + u32 params; + u16 nss_set; + } he_oper; struct ieee80211_he_obss_pd he_obss_pd; struct cfg80211_he_bss_color he_bss_color; }; diff --git a/net/mac80211/he.c b/net/mac80211/he.c index 1087f715338b..f520552b22be 100644 --- a/net/mac80211/he.c +++ b/net/mac80211/he.c @@ -57,17 +57,14 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, void ieee80211_he_op_ie_to_bss_conf(struct ieee80211_vif *vif, - const struct ieee80211_he_operation *he_op_ie_elem) + const struct ieee80211_he_operation *he_op_ie) { - struct ieee80211_he_operation *he_operation = - &vif->bss_conf.he_operation; - - if (!he_op_ie_elem) { - memset(he_operation, 0, sizeof(*he_operation)); + memset(&vif->bss_conf.he_oper, 0, sizeof(vif->bss_conf.he_oper)); + if (!he_op_ie) return; - } - vif->bss_conf.he_operation = *he_op_ie_elem; + vif->bss_conf.he_oper.params = __le32_to_cpu(he_op_ie->he_oper_params); + vif->bss_conf.he_oper.nss_set = __le16_to_cpu(he_op_ie->he_mcs_nss_set); } void -- cgit v1.2.3 From a54776f2c4939bdee084c9ecd00a4a5a25b7c429 Mon Sep 17 00:00:00 2001 From: Yunjian Wang Date: Wed, 29 Apr 2020 18:20:58 +0800 Subject: netpoll: Fix use correct return type for ndo_start_xmit() The method ndo_start_xmit() returns a value of type netdev_tx_t. Fix the ndo function to use the correct type. Signed-off-by: Yunjian Wang Signed-off-by: David S. Miller --- net/core/netpoll.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 849380a622ef..15b366a1a958 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -69,10 +69,11 @@ module_param(carrier_timeout, uint, 0644); #define np_notice(np, fmt, ...) \ pr_notice("%s: " fmt, np->name, ##__VA_ARGS__) -static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) +static netdev_tx_t netpoll_start_xmit(struct sk_buff *skb, + struct net_device *dev, + struct netdev_queue *txq) { - int status = NETDEV_TX_OK; + netdev_tx_t status = NETDEV_TX_OK; netdev_features_t features; features = netif_skb_features(skb); @@ -307,7 +308,7 @@ static int netpoll_owner_active(struct net_device *dev) void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, struct net_device *dev) { - int status = NETDEV_TX_BUSY; + netdev_tx_t status = NETDEV_TX_BUSY; unsigned long tries; /* It is up to the caller to keep npinfo alive. */ struct netpoll_info *npinfo; -- cgit v1.2.3 From ad56623119fda623ade9bc570294a1c2b203d374 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 29 Apr 2020 21:24:30 +0800 Subject: net: hsr: remove unused inline functions There's no callers in-tree anymore. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/hsr/hsr_main.h | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 7321cf8d6d2c..f74193465bf5 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -62,15 +62,6 @@ struct hsr_tag { * with the path field in-between, which seems strange. I'm guessing the MAC * address definition is in error. 
*/ -static inline u16 get_hsr_tag_path(struct hsr_tag *ht) -{ - return ntohs(ht->path_and_LSDU_size) >> 12; -} - -static inline u16 get_hsr_tag_LSDU_size(struct hsr_tag *ht) -{ - return ntohs(ht->path_and_LSDU_size) & 0x0FFF; -} static inline void set_hsr_tag_path(struct hsr_tag *ht, u16 path) { @@ -103,16 +94,6 @@ struct hsr_sup_payload { unsigned char macaddress_A[ETH_ALEN]; } __packed; -static inline u16 get_hsr_stag_path(struct hsr_sup_tag *hst) -{ - return get_hsr_tag_path((struct hsr_tag *)hst); -} - -static inline u16 get_hsr_stag_HSR_ver(struct hsr_sup_tag *hst) -{ - return get_hsr_tag_LSDU_size((struct hsr_tag *)hst); -} - static inline void set_hsr_stag_path(struct hsr_sup_tag *hst, u16 path) { set_hsr_tag_path((struct hsr_tag *)hst, path); -- cgit v1.2.3 From 0477e032a9ea34780180d9d5289f9da06714de43 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 29 Apr 2020 21:25:30 +0800 Subject: ila: remove unused inline function ila_addr_is_ila There are no callers in-tree anymore since commit 84287bb32856 ("ila: add checksum neutral map auto"). Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/ipv6/ila/ila.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h index bb6fc0d54dae..ad5f6f6ba333 100644 --- a/net/ipv6/ila/ila.h +++ b/net/ipv6/ila/ila.h @@ -68,11 +68,6 @@ static inline struct ila_addr *ila_a2i(struct in6_addr *addr) return (struct ila_addr *)addr; } -static inline bool ila_addr_is_ila(struct ila_addr *iaddr) -{ - return (iaddr->ident.type != ILA_ATYPE_IID); -} - struct ila_params { struct ila_locator locator; struct ila_locator locator_match; -- cgit v1.2.3 From fdff704dc60418e9a1bac78ae09c857d05c65aa3 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:37 +0200 Subject: net/smc: rework pnet table to support SMC-R failover The pnet table stored pnet ids in the smc device structures. When a device goes down, its smc device structure is freed, and when the device is brought online again it no longer has a pnet id set. Rework the pnet table implementation to store device names together with their assigned pnet ids, and to apply the pnet id to devices when they are registered. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S.
Miller --- net/smc/smc_ib.c | 5 +- net/smc/smc_ism.c | 3 +- net/smc/smc_pnet.c | 539 +++++++++++++++++++++++++++++++---------------------- net/smc/smc_pnet.h | 2 + 4 files changed, 319 insertions(+), 230 deletions(-) (limited to 'net') diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 04b6fefb8bce..440f9e319a38 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -579,8 +579,9 @@ static void smc_ib_add_dev(struct ib_device *ibdev) i++) { set_bit(i, &smcibdev->port_event_mask); /* determine pnetids of the port */ - smc_pnetid_by_dev_port(ibdev->dev.parent, i, - smcibdev->pnetid[i]); + if (smc_pnetid_by_dev_port(ibdev->dev.parent, i, + smcibdev->pnetid[i])) + smc_pnetid_by_table_ib(smcibdev, i + 1); } schedule_work(&smcibdev->port_event_work); } diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 5c4727d5066e..32be2da2cb85 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -296,7 +296,8 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, device_initialize(&smcd->dev); dev_set_name(&smcd->dev, name); smcd->ops = ops; - smc_pnetid_by_dev_port(parent, 0, smcd->pnetid); + if (smc_pnetid_by_dev_port(parent, 0, smcd->pnetid)) + smc_pnetid_by_table_smcd(smcd); spin_lock_init(&smcd->lock); spin_lock_init(&smcd->lgr_lock); diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 2a5ed47c3e08..bd01c71b827a 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -50,29 +50,26 @@ static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { static struct genl_family smc_pnet_nl_family; -/** - * struct smc_user_pnetentry - pnet identifier name entry for/from user - * @list: List node. - * @pnet_name: Pnet identifier name - * @ndev: pointer to network device. - * @smcibdev: Pointer to IB device. - * @ib_port: Port of IB device. - * @smcd_dev: Pointer to smcd device. - */ -struct smc_user_pnetentry { - struct list_head list; - char pnet_name[SMC_MAX_PNETID_LEN + 1]; - struct net_device *ndev; - struct smc_ib_device *smcibdev; - u8 ib_port; - struct smcd_dev *smcd_dev; +enum smc_pnet_nametype { + SMC_PNET_ETH = 1, + SMC_PNET_IB = 2, }; /* pnet entry stored in pnet table */ struct smc_pnetentry { struct list_head list; char pnet_name[SMC_MAX_PNETID_LEN + 1]; - struct net_device *ndev; + enum smc_pnet_nametype type; + union { + struct { + char eth_name[IFNAMSIZ + 1]; + struct net_device *ndev; + }; + struct { + char ib_name[IB_DEVICE_NAME_MAX + 1]; + u8 ib_port; + }; + }; }; /* Check if two given pnetids match */ @@ -106,14 +103,15 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; - /* remove netdevices */ + /* remove table entry */ write_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (!pnet_name || smc_pnet_match(pnetelem->pnet_name, pnet_name)) { list_del(&pnetelem->list); - dev_put(pnetelem->ndev); + if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev) + dev_put(pnetelem->ndev); kfree(pnetelem); rc = 0; } @@ -155,9 +153,9 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) return rc; } -/* Remove a pnet entry mentioning a given network device from the pnet table. +/* Add the reference to a given network device to the pnet table. 
*/ -static int smc_pnet_remove_by_ndev(struct net_device *ndev) +static int smc_pnet_add_by_ndev(struct net_device *ndev) { struct smc_pnetentry *pnetelem, *tmp_pe; struct smc_pnettable *pnettable; @@ -171,10 +169,10 @@ static int smc_pnet_remove_by_ndev(struct net_device *ndev) write_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { - if (pnetelem->ndev == ndev) { - list_del(&pnetelem->list); - dev_put(pnetelem->ndev); - kfree(pnetelem); + if (pnetelem->type == SMC_PNET_ETH && !pnetelem->ndev && + !strncmp(pnetelem->eth_name, ndev->name, IFNAMSIZ)) { + dev_hold(ndev); + pnetelem->ndev = ndev; rc = 0; break; } @@ -183,80 +181,67 @@ static int smc_pnet_remove_by_ndev(struct net_device *ndev) return rc; } -/* Append a pnetid to the end of the pnet table if not already on this list. +/* Remove the reference to a given network device from the pnet table. */ -static int smc_pnet_enter(struct smc_pnettable *pnettable, - struct smc_user_pnetentry *new_pnetelem) +static int smc_pnet_remove_by_ndev(struct net_device *ndev) { - u8 pnet_null[SMC_MAX_PNETID_LEN] = {0}; - u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; - struct smc_pnetentry *tmp_pnetelem; - struct smc_pnetentry *pnetelem; - bool new_smcddev = false; - struct net_device *ndev; - bool new_netdev = true; - bool new_ibdev = false; - - if (new_pnetelem->smcibdev) { - struct smc_ib_device *ib_dev = new_pnetelem->smcibdev; - int ib_port = new_pnetelem->ib_port; + struct smc_pnetentry *pnetelem, *tmp_pe; + struct smc_pnettable *pnettable; + struct net *net = dev_net(ndev); + struct smc_net *sn; + int rc = -ENOENT; - spin_lock(&smc_ib_devices.lock); - if (smc_pnet_match(ib_dev->pnetid[ib_port - 1], pnet_null)) { - memcpy(ib_dev->pnetid[ib_port - 1], - new_pnetelem->pnet_name, SMC_MAX_PNETID_LEN); - ib_dev->pnetid_by_user[ib_port - 1] = true; - new_ibdev = true; - } - spin_unlock(&smc_ib_devices.lock); - } - if (new_pnetelem->smcd_dev) { - struct smcd_dev *smcd_dev = new_pnetelem->smcd_dev; + /* get pnettable for namespace */ + sn = net_generic(net, smc_net_id); + pnettable = &sn->pnettable; - spin_lock(&smcd_dev_list.lock); - if (smc_pnet_match(smcd_dev->pnetid, pnet_null)) { - memcpy(smcd_dev->pnetid, new_pnetelem->pnet_name, - SMC_MAX_PNETID_LEN); - smcd_dev->pnetid_by_user = true; - new_smcddev = true; + write_lock(&pnettable->lock); + list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { + if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev == ndev) { + dev_put(pnetelem->ndev); + pnetelem->ndev = NULL; + rc = 0; + break; } - spin_unlock(&smcd_dev_list.lock); } + write_unlock(&pnettable->lock); + return rc; +} - if (!new_pnetelem->ndev) - return (new_ibdev || new_smcddev) ? 0 : -EEXIST; +/* Apply pnetid to ib device when no pnetid is set. + */ +static bool smc_pnet_apply_ib(struct smc_ib_device *ib_dev, u8 ib_port, + char *pnet_name) +{ + u8 pnet_null[SMC_MAX_PNETID_LEN] = {0}; + bool applied = false; - /* check if (base) netdev already has a pnetid. If there is one, we do - * not want to add a pnet table entry - */ - ndev = pnet_find_base_ndev(new_pnetelem->ndev); - if (!smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, - ndev_pnetid)) - return (new_ibdev || new_smcddev) ? 
0 : -EEXIST; + spin_lock(&smc_ib_devices.lock); + if (smc_pnet_match(ib_dev->pnetid[ib_port - 1], pnet_null)) { + memcpy(ib_dev->pnetid[ib_port - 1], pnet_name, + SMC_MAX_PNETID_LEN); + ib_dev->pnetid_by_user[ib_port - 1] = true; + applied = true; + } + spin_unlock(&smc_ib_devices.lock); + return applied; +} - /* add a new netdev entry to the pnet table if there isn't one */ - tmp_pnetelem = kzalloc(sizeof(*pnetelem), GFP_KERNEL); - if (!tmp_pnetelem) - return -ENOMEM; - memcpy(tmp_pnetelem->pnet_name, new_pnetelem->pnet_name, - SMC_MAX_PNETID_LEN); - tmp_pnetelem->ndev = new_pnetelem->ndev; +/* Apply pnetid to smcd device when no pnetid is set. + */ +static bool smc_pnet_apply_smcd(struct smcd_dev *smcd_dev, char *pnet_name) +{ + u8 pnet_null[SMC_MAX_PNETID_LEN] = {0}; + bool applied = false; - write_lock(&pnettable->lock); - list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { - if (pnetelem->ndev == new_pnetelem->ndev) - new_netdev = false; - } - if (new_netdev) { - dev_hold(tmp_pnetelem->ndev); - list_add_tail(&tmp_pnetelem->list, &pnettable->pnetlist); - write_unlock(&pnettable->lock); - } else { - write_unlock(&pnettable->lock); - kfree(tmp_pnetelem); + spin_lock(&smcd_dev_list.lock); + if (smc_pnet_match(smcd_dev->pnetid, pnet_null)) { + memcpy(smcd_dev->pnetid, pnet_name, SMC_MAX_PNETID_LEN); + smcd_dev->pnetid_by_user = true; + applied = true; } - - return (new_netdev || new_ibdev || new_smcddev) ? 0 : -EEXIST; + spin_unlock(&smcd_dev_list.lock); + return applied; } /* The limit for pnetid is 16 characters. @@ -323,57 +308,167 @@ out: return smcd_dev; } -/* Parse the supplied netlink attributes and fill a pnetentry structure. - * For ethernet and infiniband device names verify that the devices exist. +static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, + char *eth_name, char *pnet_name) +{ + struct smc_pnetentry *tmp_pe, *new_pe; + struct net_device *ndev, *base_ndev; + u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + bool new_netdev; + int rc; + + /* check if (base) netdev already has a pnetid. 
If there is one, we do + * not want to add a pnet table entry + */ + rc = -EEXIST; + ndev = dev_get_by_name(net, eth_name); /* dev_hold() */ + if (ndev) { + base_ndev = pnet_find_base_ndev(ndev); + if (!smc_pnetid_by_dev_port(base_ndev->dev.parent, + base_ndev->dev_port, ndev_pnetid)) + goto out_put; + } + + /* add a new netdev entry to the pnet table if there isn't one */ + rc = -ENOMEM; + new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); + if (!new_pe) + goto out_put; + new_pe->type = SMC_PNET_ETH; + memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); + strncpy(new_pe->eth_name, eth_name, IFNAMSIZ); + new_pe->ndev = ndev; + + rc = -EEXIST; + new_netdev = true; + write_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_ETH && + !strncmp(tmp_pe->eth_name, eth_name, IFNAMSIZ)) { + new_netdev = false; + break; + } + } + if (new_netdev) { + list_add_tail(&new_pe->list, &pnettable->pnetlist); + write_unlock(&pnettable->lock); + } else { + write_unlock(&pnettable->lock); + kfree(new_pe); + goto out_put; + } + return 0; + +out_put: + if (ndev) + dev_put(ndev); + return rc; +} + +static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, + u8 ib_port, char *pnet_name) +{ + struct smc_pnetentry *tmp_pe, *new_pe; + struct smc_ib_device *ib_dev; + bool smcddev_applied = true; + bool ibdev_applied = true; + struct smcd_dev *smcd_dev; + bool new_ibdev; + + /* try to apply the pnetid to active devices */ + ib_dev = smc_pnet_find_ib(ib_name); + if (ib_dev) + ibdev_applied = smc_pnet_apply_ib(ib_dev, ib_port, pnet_name); + smcd_dev = smc_pnet_find_smcd(ib_name); + if (smcd_dev) + smcddev_applied = smc_pnet_apply_smcd(smcd_dev, pnet_name); + /* Apply fails when a device has a hardware-defined pnetid set, do not + * add a pnet table entry in that case. + */ + if (!ibdev_applied || !smcddev_applied) + return -EEXIST; + + /* add a new ib entry to the pnet table if there isn't one */ + new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); + if (!new_pe) + return -ENOMEM; + new_pe->type = SMC_PNET_IB; + memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); + strncpy(new_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX); + new_pe->ib_port = ib_port; + + new_ibdev = true; + write_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_IB && + !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { + new_ibdev = false; + break; + } + } + if (new_ibdev) { + list_add_tail(&new_pe->list, &pnettable->pnetlist); + write_unlock(&pnettable->lock); + } else { + write_unlock(&pnettable->lock); + kfree(new_pe); + } + return (new_ibdev) ? 0 : -EEXIST; +} + +/* Append a pnetid to the end of the pnet table if not already on this list. 
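+ * Called from smc_pnet_add(); dispatches to smc_pnet_add_eth() and/or
+ * smc_pnet_add_ib() depending on which netlink attributes are present.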
*/ -static int smc_pnet_fill_entry(struct net *net, - struct smc_user_pnetentry *pnetelem, - struct nlattr *tb[]) +static int smc_pnet_enter(struct net *net, struct nlattr *tb[]) { - char *string, *ibname; + char pnet_name[SMC_MAX_PNETID_LEN + 1]; + struct smc_pnettable *pnettable; + bool new_netdev = false; + bool new_ibdev = false; + struct smc_net *sn; + u8 ibport = 1; + char *string; int rc; - memset(pnetelem, 0, sizeof(*pnetelem)); - INIT_LIST_HEAD(&pnetelem->list); + /* get pnettable for namespace */ + sn = net_generic(net, smc_net_id); + pnettable = &sn->pnettable; rc = -EINVAL; if (!tb[SMC_PNETID_NAME]) goto error; string = (char *)nla_data(tb[SMC_PNETID_NAME]); - if (!smc_pnetid_valid(string, pnetelem->pnet_name)) + if (!smc_pnetid_valid(string, pnet_name)) goto error; - rc = -EINVAL; if (tb[SMC_PNETID_ETHNAME]) { string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]); - pnetelem->ndev = dev_get_by_name(net, string); - if (!pnetelem->ndev) + rc = smc_pnet_add_eth(pnettable, net, string, pnet_name); + if (!rc) + new_netdev = true; + else if (rc != -EEXIST) goto error; } /* if this is not the initial namespace, stop here */ if (net != &init_net) - return 0; + return new_netdev ? 0 : -EEXIST; rc = -EINVAL; if (tb[SMC_PNETID_IBNAME]) { - ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]); - ibname = strim(ibname); - pnetelem->smcibdev = smc_pnet_find_ib(ibname); - pnetelem->smcd_dev = smc_pnet_find_smcd(ibname); - if (!pnetelem->smcibdev && !pnetelem->smcd_dev) - goto error; - if (pnetelem->smcibdev) { - if (!tb[SMC_PNETID_IBPORT]) - goto error; - pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]); - if (pnetelem->ib_port < 1 || - pnetelem->ib_port > SMC_MAX_PORTS) + string = (char *)nla_data(tb[SMC_PNETID_IBNAME]); + string = strim(string); + if (tb[SMC_PNETID_IBPORT]) { + ibport = nla_get_u8(tb[SMC_PNETID_IBPORT]); + if (ibport < 1 || ibport > SMC_MAX_PORTS) goto error; } + rc = smc_pnet_add_ib(pnettable, string, ibport, pnet_name); + if (!rc) + new_ibdev = true; + else if (rc != -EEXIST) + goto error; } - - return 0; + return (new_netdev || new_ibdev) ? 
0 : -EEXIST; error: return rc; @@ -381,28 +476,22 @@ error: /* Convert an smc_pnetentry to a netlink attribute sequence */ static int smc_pnet_set_nla(struct sk_buff *msg, - struct smc_user_pnetentry *pnetelem) + struct smc_pnetentry *pnetelem) { if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name)) return -1; - if (pnetelem->ndev) { + if (pnetelem->type == SMC_PNET_ETH) { if (nla_put_string(msg, SMC_PNETID_ETHNAME, - pnetelem->ndev->name)) + pnetelem->eth_name)) return -1; } else { if (nla_put_string(msg, SMC_PNETID_ETHNAME, "n/a")) return -1; } - if (pnetelem->smcibdev) { - if (nla_put_string(msg, SMC_PNETID_IBNAME, - dev_name(pnetelem->smcibdev->ibdev->dev.parent)) || + if (pnetelem->type == SMC_PNET_IB) { + if (nla_put_string(msg, SMC_PNETID_IBNAME, pnetelem->ib_name) || nla_put_u8(msg, SMC_PNETID_IBPORT, pnetelem->ib_port)) return -1; - } else if (pnetelem->smcd_dev) { - if (nla_put_string(msg, SMC_PNETID_IBNAME, - dev_name(&pnetelem->smcd_dev->dev)) || - nla_put_u8(msg, SMC_PNETID_IBPORT, 1)) - return -1; } else { if (nla_put_string(msg, SMC_PNETID_IBNAME, "n/a") || nla_put_u8(msg, SMC_PNETID_IBPORT, 0xff)) @@ -415,21 +504,8 @@ static int smc_pnet_set_nla(struct sk_buff *msg, static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); - struct smc_user_pnetentry pnetelem; - struct smc_pnettable *pnettable; - struct smc_net *sn; - int rc; - - /* get pnettable for namespace */ - sn = net_generic(net, smc_net_id); - pnettable = &sn->pnettable; - rc = smc_pnet_fill_entry(net, &pnetelem, info->attrs); - if (!rc) - rc = smc_pnet_enter(pnettable, &pnetelem); - if (pnetelem.ndev) - dev_put(pnetelem.ndev); - return rc; + return smc_pnet_enter(net, info->attrs); } static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info) @@ -450,7 +526,7 @@ static int smc_pnet_dump_start(struct netlink_callback *cb) static int smc_pnet_dumpinfo(struct sk_buff *skb, u32 portid, u32 seq, u32 flags, - struct smc_user_pnetentry *pnetelem) + struct smc_pnetentry *pnetelem) { void *hdr; @@ -469,91 +545,32 @@ static int smc_pnet_dumpinfo(struct sk_buff *skb, static int _smc_pnet_dump(struct net *net, struct sk_buff *skb, u32 portid, u32 seq, u8 *pnetid, int start_idx) { - struct smc_user_pnetentry tmp_entry; struct smc_pnettable *pnettable; struct smc_pnetentry *pnetelem; - struct smc_ib_device *ibdev; - struct smcd_dev *smcd_dev; struct smc_net *sn; int idx = 0; - int ibport; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; - /* dump netdevices */ + /* dump pnettable entries */ read_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { if (pnetid && !smc_pnet_match(pnetelem->pnet_name, pnetid)) continue; if (idx++ < start_idx) continue; - memset(&tmp_entry, 0, sizeof(tmp_entry)); - memcpy(&tmp_entry.pnet_name, pnetelem->pnet_name, - SMC_MAX_PNETID_LEN); - tmp_entry.ndev = pnetelem->ndev; + /* if this is not the initial namespace, dump only netdev */ + if (net != &init_net && pnetelem->type != SMC_PNET_ETH) + continue; if (smc_pnet_dumpinfo(skb, portid, seq, NLM_F_MULTI, - &tmp_entry)) { + pnetelem)) { --idx; break; } } read_unlock(&pnettable->lock); - - /* if this is not the initial namespace, stop here */ - if (net != &init_net) - return idx; - - /* dump ib devices */ - spin_lock(&smc_ib_devices.lock); - list_for_each_entry(ibdev, &smc_ib_devices.list, list) { - for (ibport = 0; ibport < SMC_MAX_PORTS; ibport++) { - if (ibdev->pnetid_by_user[ibport]) { - if (pnetid && - 
!smc_pnet_match(ibdev->pnetid[ibport], - pnetid)) - continue; - if (idx++ < start_idx) - continue; - memset(&tmp_entry, 0, sizeof(tmp_entry)); - memcpy(&tmp_entry.pnet_name, - ibdev->pnetid[ibport], - SMC_MAX_PNETID_LEN); - tmp_entry.smcibdev = ibdev; - tmp_entry.ib_port = ibport + 1; - if (smc_pnet_dumpinfo(skb, portid, seq, - NLM_F_MULTI, - &tmp_entry)) { - --idx; - break; - } - } - } - } - spin_unlock(&smc_ib_devices.lock); - - /* dump smcd devices */ - spin_lock(&smcd_dev_list.lock); - list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { - if (smcd_dev->pnetid_by_user) { - if (pnetid && !smc_pnet_match(smcd_dev->pnetid, pnetid)) - continue; - if (idx++ < start_idx) - continue; - memset(&tmp_entry, 0, sizeof(tmp_entry)); - memcpy(&tmp_entry.pnet_name, smcd_dev->pnetid, - SMC_MAX_PNETID_LEN); - tmp_entry.smcd_dev = smcd_dev; - if (smc_pnet_dumpinfo(skb, portid, seq, NLM_F_MULTI, - &tmp_entry)) { - --idx; - break; - } - } - } - spin_unlock(&smcd_dev_list.lock); - return idx; } @@ -659,6 +676,9 @@ static int smc_pnet_netdev_event(struct notifier_block *this, case NETDEV_UNREGISTER: smc_pnet_remove_by_ndev(event_dev); return NOTIFY_OK; + case NETDEV_REGISTER: + smc_pnet_add_by_ndev(event_dev); + return NOTIFY_OK; default: return NOTIFY_DONE; } @@ -744,7 +764,7 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, read_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { - if (ndev == pnetelem->ndev) { + if (pnetelem->type == SMC_PNET_ETH && ndev == pnetelem->ndev) { /* get pnetid of netdev device */ memcpy(pnetid, pnetelem->pnet_name, SMC_MAX_PNETID_LEN); rc = 0; @@ -755,6 +775,34 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, return rc; } +/* find a roce device for the given pnetid */ +static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, + struct smc_init_info *ini) +{ + struct smc_ib_device *ibdev; + int i; + + ini->ib_dev = NULL; + spin_lock(&smc_ib_devices.lock); + list_for_each_entry(ibdev, &smc_ib_devices.list, list) { + for (i = 1; i <= SMC_MAX_PORTS; i++) { + if (!rdma_is_port_valid(ibdev->ibdev, i)) + continue; + if (smc_pnet_match(ibdev->pnetid[i - 1], pnet_id) && + smc_ib_port_active(ibdev, i) && + !test_bit(i - 1, ibdev->ports_going_away) && + !smc_ib_determine_gid(ibdev, i, ini->vlan_id, + ini->ib_gid, NULL)) { + ini->ib_dev = ibdev; + ini->ib_port = i; + goto out; + } + } + } +out: + spin_unlock(&smc_ib_devices.lock); +} + /* if handshake network device belongs to a roce device, return its * IB device and port */ @@ -801,8 +849,6 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; - struct smc_ib_device *ibdev; - int i; ndev = pnet_find_base_ndev(ndev); if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, @@ -811,25 +857,7 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, smc_pnet_find_rdma_dev(ndev, ini); return; /* pnetid could not be determined */ } - - spin_lock(&smc_ib_devices.lock); - list_for_each_entry(ibdev, &smc_ib_devices.list, list) { - for (i = 1; i <= SMC_MAX_PORTS; i++) { - if (!rdma_is_port_valid(ibdev->ibdev, i)) - continue; - if (smc_pnet_match(ibdev->pnetid[i - 1], ndev_pnetid) && - smc_ib_port_active(ibdev, i) && - !test_bit(i - 1, ibdev->ports_going_away) && - !smc_ib_determine_gid(ibdev, i, ini->vlan_id, - ini->ib_gid, NULL)) { - ini->ib_dev = ibdev; - ini->ib_port = i; - goto out; - } - } - } -out: - spin_unlock(&smc_ib_devices.lock); + 
_smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini); } static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, @@ -895,3 +923,60 @@ out_rel: out: return; } + +/* Lookup and apply a pnet table entry to the given ib device. + */ +int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) +{ + char *ib_name = smcibdev->ibdev->name; + struct smc_pnettable *pnettable; + struct smc_pnetentry *tmp_pe; + struct smc_net *sn; + int rc = -ENOENT; + + /* get pnettable for init namespace */ + sn = net_generic(&init_net, smc_net_id); + pnettable = &sn->pnettable; + + read_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_IB && + !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX) && + tmp_pe->ib_port == ib_port) { + smc_pnet_apply_ib(smcibdev, ib_port, tmp_pe->pnet_name); + rc = 0; + break; + } + } + read_unlock(&pnettable->lock); + + return rc; +} + +/* Lookup and apply a pnet table entry to the given smcd device. + */ +int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) +{ + const char *ib_name = dev_name(&smcddev->dev); + struct smc_pnettable *pnettable; + struct smc_pnetentry *tmp_pe; + struct smc_net *sn; + int rc = -ENOENT; + + /* get pnettable for init namespace */ + sn = net_generic(&init_net, smc_net_id); + pnettable = &sn->pnettable; + + read_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_IB && + !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { + smc_pnet_apply_smcd(smcddev, tmp_pe->pnet_name); + rc = 0; + break; + } + } + read_unlock(&pnettable->lock); + + return rc; +} diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h index 4564e4d69c2e..ea207f8fc6f7 100644 --- a/net/smc/smc_pnet.h +++ b/net/smc/smc_pnet.h @@ -46,5 +46,7 @@ void smc_pnet_exit(void); void smc_pnet_net_exit(struct net *net); void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini); void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini); +int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port); +int smc_pnetid_by_table_smcd(struct smcd_dev *smcd); #endif -- cgit v1.2.3 From f3c1deddb21c19fb0eec3c61e80567ef4a79b58b Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:38 +0200 Subject: net/smc: separate function for link initialization Move the initialization of a new link into its own function, separate from smc_lgr_create, to allow more than one link per link group. Do an extra check if the IB device initialization was successful, and reset the link state if any error occurs during smcr_link_init(). And rename two existing functions to use the prefix smcr_ to indicate that they belong to the SMC-R code path. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 114 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 66 insertions(+), 48 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 824c5211b027..3bb45c33db22 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -179,7 +179,7 @@ void smc_lgr_cleanup_early(struct smc_connection *conn) * of the DELETE LINK sequence from server; or as server to * initiate the delete processing. See smc_llc_rx_delete_link(). 
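+ * Returns 0 if the DELETE LINK request was sent, non-zero otherwise; on
+ * success the caller reschedules its free work in case no response arrives.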
*/ -static int smc_link_send_delete(struct smc_link *lnk, bool orderly) +static int smcr_link_send_delete(struct smc_link *lnk, bool orderly) { if (lnk->state == SMC_LNK_ACTIVE && !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, orderly)) { @@ -219,7 +219,7 @@ static void smc_lgr_free_work(struct work_struct *work) if (!lgr->is_smcd && !lgr->terminating) { /* try to send del link msg, on error free lgr immediately */ if (lnk->state == SMC_LNK_ACTIVE && - !smc_link_send_delete(lnk, true)) { + !smcr_link_send_delete(lnk, true)) { /* reschedule in case we never receive a response */ smc_lgr_schedule_free_work(lgr); spin_unlock_bh(lgr_lock); @@ -245,6 +245,64 @@ static void smc_lgr_terminate_work(struct work_struct *work) __smc_lgr_terminate(lgr, true); } +static int smcr_link_init(struct smc_link *lnk, u8 link_id, + struct smc_init_info *ini) +{ + u8 rndvec[3]; + int rc; + + get_device(&ini->ib_dev->ibdev->dev); + atomic_inc(&ini->ib_dev->lnk_cnt); + lnk->state = SMC_LNK_ACTIVATING; + lnk->link_id = link_id; + lnk->smcibdev = ini->ib_dev; + lnk->ibport = ini->ib_port; + lnk->path_mtu = ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; + if (!ini->ib_dev->initialized) { + rc = (int)smc_ib_setup_per_ibdev(ini->ib_dev); + if (rc) + goto out; + } + get_random_bytes(rndvec, sizeof(rndvec)); + lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + + (rndvec[2] << 16); + rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport, + ini->vlan_id, lnk->gid, &lnk->sgid_index); + if (rc) + goto out; + rc = smc_llc_link_init(lnk); + if (rc) + goto out; + rc = smc_wr_alloc_link_mem(lnk); + if (rc) + goto clear_llc_lnk; + rc = smc_ib_create_protection_domain(lnk); + if (rc) + goto free_link_mem; + rc = smc_ib_create_queue_pair(lnk); + if (rc) + goto dealloc_pd; + rc = smc_wr_create_link(lnk); + if (rc) + goto destroy_qp; + return 0; + +destroy_qp: + smc_ib_destroy_queue_pair(lnk); +dealloc_pd: + smc_ib_dealloc_protection_domain(lnk); +free_link_mem: + smc_wr_free_link_mem(lnk); +clear_llc_lnk: + smc_llc_link_clear(lnk); +out: + put_device(&ini->ib_dev->ibdev->dev); + memset(lnk, 0, sizeof(struct smc_link)); + if (!atomic_dec_return(&ini->ib_dev->lnk_cnt)) + wake_up(&ini->ib_dev->lnks_deleted); + return rc; +} + /* create a new SMC link group */ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) { @@ -252,7 +310,6 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) struct list_head *lgr_list; struct smc_link *lnk; spinlock_t *lgr_lock; - u8 rndvec[3]; int rc = 0; int i; @@ -297,48 +354,17 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) atomic_inc(&ini->ism_dev->lgr_cnt); } else { /* SMC-R specific settings */ - get_device(&ini->ib_dev->ibdev->dev); lgr->role = smc->listen_smc ? 
SMC_SERV : SMC_CLNT; memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, SMC_SYSTEMID_LEN); lnk = &lgr->lnk[SMC_SINGLE_LINK]; - /* initialize link */ - lnk->state = SMC_LNK_ACTIVATING; - lnk->link_id = SMC_SINGLE_LINK; - lnk->smcibdev = ini->ib_dev; - lnk->ibport = ini->ib_port; - lgr_list = &smc_lgr_list.list; - lgr_lock = &smc_lgr_list.lock; - lnk->path_mtu = - ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; - if (!ini->ib_dev->initialized) - smc_ib_setup_per_ibdev(ini->ib_dev); - get_random_bytes(rndvec, sizeof(rndvec)); - lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + - (rndvec[2] << 16); - rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport, - ini->vlan_id, lnk->gid, - &lnk->sgid_index); - if (rc) - goto free_lgr; - rc = smc_llc_link_init(lnk); + rc = smcr_link_init(lnk, SMC_SINGLE_LINK, ini); if (rc) goto free_lgr; - rc = smc_wr_alloc_link_mem(lnk); - if (rc) - goto clear_llc_lnk; - rc = smc_ib_create_protection_domain(lnk); - if (rc) - goto free_link_mem; - rc = smc_ib_create_queue_pair(lnk); - if (rc) - goto dealloc_pd; - rc = smc_wr_create_link(lnk); - if (rc) - goto destroy_qp; + lgr_list = &smc_lgr_list.list; + lgr_lock = &smc_lgr_list.lock; atomic_inc(&lgr_cnt); - atomic_inc(&ini->ib_dev->lnk_cnt); } smc->conn.lgr = lgr; spin_lock_bh(lgr_lock); @@ -346,14 +372,6 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) spin_unlock_bh(lgr_lock); return 0; -destroy_qp: - smc_ib_destroy_queue_pair(lnk); -dealloc_pd: - smc_ib_dealloc_protection_domain(lnk); -free_link_mem: - smc_wr_free_link_mem(lnk); -clear_llc_lnk: - smc_llc_link_clear(lnk); free_lgr: kfree(lgr); ism_put_vlan: @@ -417,7 +435,7 @@ void smc_conn_free(struct smc_connection *conn) smc_lgr_schedule_free_work(lgr); } -static void smc_link_clear(struct smc_link *lnk) +static void smcr_link_clear(struct smc_link *lnk) { lnk->peer_qpn = 0; smc_llc_link_clear(lnk); @@ -426,6 +444,7 @@ static void smc_link_clear(struct smc_link *lnk) smc_ib_destroy_queue_pair(lnk); smc_ib_dealloc_protection_domain(lnk); smc_wr_free_link_mem(lnk); + put_device(&lnk->smcibdev->ibdev->dev); if (!atomic_dec_return(&lnk->smcibdev->lnk_cnt)) wake_up(&lnk->smcibdev->lnks_deleted); } @@ -512,8 +531,7 @@ static void smc_lgr_free(struct smc_link_group *lgr) if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) wake_up(&lgr->smcd->lgrs_deleted); } else { - smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); - put_device(&lgr->lnk[SMC_SINGLE_LINK].smcibdev->ibdev->dev); + smcr_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); } -- cgit v1.2.3 From 026c381fb4778d0d44af57b7ff674f31f04af221 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:39 +0200 Subject: net/smc: introduce link_idx for link group array The link_id is the index of the link in the array of the link group. When a link in the array is reused for a new link, a different unique link_id should be used, otherwise the index in the array could collide with the previous link at this array position. Use a new variable link_idx as array index, and make link_id an increasing unique id value. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_core.c | 34 +++++++++++++++++++++++++++++----- net/smc/smc_core.h | 2 ++ 2 files changed, 31 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 3bb45c33db22..d233112ced2a 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -245,8 +245,28 @@ static void smc_lgr_terminate_work(struct work_struct *work) __smc_lgr_terminate(lgr, true); } -static int smcr_link_init(struct smc_link *lnk, u8 link_id, - struct smc_init_info *ini) +/* return next unique link id for the lgr */ +static u8 smcr_next_link_id(struct smc_link_group *lgr) +{ + u8 link_id; + int i; + + while (1) { + link_id = ++lgr->next_link_id; + if (!link_id) /* skip zero as link_id */ + link_id = ++lgr->next_link_id; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state != SMC_LNK_INACTIVE && + lgr->lnk[i].link_id == link_id) + continue; + } + break; + } + return link_id; +} + +static int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, + u8 link_idx, struct smc_init_info *ini) { u8 rndvec[3]; int rc; @@ -254,7 +274,8 @@ static int smcr_link_init(struct smc_link *lnk, u8 link_id, get_device(&ini->ib_dev->ibdev->dev); atomic_inc(&ini->ib_dev->lnk_cnt); lnk->state = SMC_LNK_ACTIVATING; - lnk->link_id = link_id; + lnk->link_id = smcr_next_link_id(lgr); + lnk->link_idx = link_idx; lnk->smcibdev = ini->ib_dev; lnk->ibport = ini->ib_port; lnk->path_mtu = ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; @@ -310,6 +331,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) struct list_head *lgr_list; struct smc_link *lnk; spinlock_t *lgr_lock; + u8 link_idx; int rc = 0; int i; @@ -338,6 +360,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) INIT_LIST_HEAD(&lgr->sndbufs[i]); INIT_LIST_HEAD(&lgr->rmbs[i]); } + lgr->next_link_id = 0; smc_lgr_list.num += SMC_LGR_NUM_INCR; memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE); INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); @@ -358,8 +381,9 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, SMC_SYSTEMID_LEN); - lnk = &lgr->lnk[SMC_SINGLE_LINK]; - rc = smcr_link_init(lnk, SMC_SINGLE_LINK, ini); + link_idx = SMC_SINGLE_LINK; + lnk = &lgr->lnk[link_idx]; + rc = smcr_link_init(lgr, lnk, link_idx, ini); if (rc) goto free_lgr; lgr_list = &smc_lgr_list.list; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 8041db20c753..c459b0639bf3 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -115,6 +115,7 @@ struct smc_link { u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */ u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/ u8 link_id; /* unique # within link group */ + u8 link_idx; /* index in lgr link array */ enum smc_link_state state; /* state of link */ struct workqueue_struct *llc_wq; /* single thread work queue */ @@ -222,6 +223,7 @@ struct smc_link_group { /* remote addr/key pairs */ DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX); /* used rtoken elements */ + u8 next_link_id; }; struct { /* SMC-D */ u64 peer_gid; -- cgit v1.2.3 From 387707fdf48697c667dd5e9715ac4feb41602d15 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:40 +0200 Subject: net/smc: convert static link ID to dynamic references As a preparation for the support of multiple links remove the usage of a static link id (SMC_SINGLE_LINK) and allow dynamic link ids. 
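For illustration only (this snippet is not part of the patch), the access pattern changes roughly as follows: the connection now carries the link it was assigned to, and the per-link arrays are indexed by that link's slot in the link group:

	/* before: every user hard-coded the one and only link slot */
	struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
	rkey = htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);

	/* after: take the link from the connection and index the per-link
	 * arrays (mr_rx[], sgt[], rtokens[][]) by its link_idx
	 */
	struct smc_link *link = conn->lnk;
	rkey = htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey);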
Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 17 +++++--------- net/smc/smc.h | 1 + net/smc/smc_cdc.c | 8 +++---- net/smc/smc_clc.c | 12 +++++----- net/smc/smc_core.c | 66 +++++++++++++++++++++++++++--------------------------- net/smc/smc_core.h | 7 +++--- net/smc/smc_ib.c | 58 +++++++++++++++++++++++------------------------ net/smc/smc_ib.h | 10 ++++----- net/smc/smc_llc.c | 10 ++++----- net/smc/smc_tx.c | 13 ++++++----- 10 files changed, 99 insertions(+), 103 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 6fd44bdb0fc3..6e4bad8c64a8 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -343,7 +343,7 @@ static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc, { if (!rmb_desc->wr_reg) { /* register memory region for new rmb */ - if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) { + if (smc_wr_reg_send(link, rmb_desc->mr_rx[link->link_idx])) { rmb_desc->regerr = 1; return -EFAULT; } @@ -362,12 +362,10 @@ static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc, static int smc_clnt_conf_first_link(struct smc_sock *smc) { struct net *net = sock_net(smc->clcsock->sk); - struct smc_link_group *lgr = smc->conn.lgr; - struct smc_link *link; + struct smc_link *link = smc->conn.lnk; int rest; int rc; - link = &lgr->lnk[SMC_SINGLE_LINK]; /* receive CONFIRM LINK request from server over RoCE fabric */ rest = wait_for_completion_interruptible_timeout( &link->llc_confirm, @@ -610,7 +608,7 @@ static int smc_connect_rdma(struct smc_sock *smc, mutex_unlock(&smc_client_lgr_pending); return reason_code; } - link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK]; + link = smc->conn.lnk; smc_conn_save_peer_info(smc, aclc); @@ -1002,13 +1000,10 @@ void smc_close_non_accepted(struct sock *sk) static int smc_serv_conf_first_link(struct smc_sock *smc) { struct net *net = sock_net(smc->clcsock->sk); - struct smc_link_group *lgr = smc->conn.lgr; - struct smc_link *link; + struct smc_link *link = smc->conn.lnk; int rest; int rc; - link = &lgr->lnk[SMC_SINGLE_LINK]; - if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) return SMC_CLC_DECL_ERR_REGRMB; @@ -1194,7 +1189,7 @@ static int smc_listen_ism_init(struct smc_sock *new_smc, /* listen worker: register buffers */ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) { - struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; + struct smc_link *link = new_smc->conn.lnk; if (local_contact != SMC_FIRST_CONTACT) { if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true)) @@ -1210,7 +1205,7 @@ static int smc_listen_rdma_finish(struct smc_sock *new_smc, struct smc_clc_msg_accept_confirm *cclc, int local_contact) { - struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; + struct smc_link *link = new_smc->conn.lnk; int reason_code = 0; if (local_contact == SMC_FIRST_CONTACT) diff --git a/net/smc/smc.h b/net/smc/smc.h index be11ba41190f..1a084afa7372 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -121,6 +121,7 @@ enum smc_urg_state { struct smc_connection { struct rb_node alert_node; struct smc_link_group *lgr; /* link group of connection */ + struct smc_link *lnk; /* assigned SMC-R link */ u32 alert_token_local; /* unique conn. 
id */ u8 peer_rmbe_idx; /* from tcp handshake */ int peer_rmbe_size; /* size of peer rx buffer */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 164f1584861b..f64589d823aa 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -57,7 +57,7 @@ int smc_cdc_get_free_slot(struct smc_connection *conn, struct smc_rdma_wr **wr_rdma_buf, struct smc_cdc_tx_pend **pend) { - struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + struct smc_link *link = conn->lnk; int rc; rc = smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf, @@ -91,12 +91,10 @@ int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, struct smc_cdc_tx_pend *pend) { + struct smc_link *link = conn->lnk; union smc_host_cursor cfed; - struct smc_link *link; int rc; - link = &conn->lgr->lnk[SMC_SINGLE_LINK]; - smc_cdc_add_pending_send(conn, pend); conn->tx_cdc_seq++; @@ -165,7 +163,7 @@ static void smc_cdc_tx_dismisser(struct smc_wr_tx_pend_priv *tx_pend) void smc_cdc_tx_dismiss_slots(struct smc_connection *conn) { - struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + struct smc_link *link = conn->lnk; smc_wr_tx_dismiss_slots(link, SMC_CDC_MSG_TYPE, smc_cdc_tx_filter, smc_cdc_tx_dismisser, diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index ea0068f0173c..d5627df24215 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -496,7 +496,7 @@ int smc_clc_send_confirm(struct smc_sock *smc) sizeof(SMCD_EYECATCHER)); } else { /* SMC-R specific settings */ - link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + link = conn->lnk; memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); cclc.hdr.path = SMC_TYPE_R; @@ -508,13 +508,13 @@ int smc_clc_send_confirm(struct smc_sock *smc) ETH_ALEN); hton24(cclc.qpn, link->roce_qp->qp_num); cclc.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey); cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ cclc.rmbe_alert_token = htonl(conn->alert_token_local); cclc.qp_mtu = min(link->path_mtu, link->peer_mtu); cclc.rmbe_size = conn->rmbe_size_short; cclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address - (conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); + (conn->rmb_desc->sgt[link->link_idx].sgl)); hton24(cclc.psn, link->psn_initial); memcpy(cclc.smcr_trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); @@ -572,7 +572,7 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); aclc.hdr.path = SMC_TYPE_R; - link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + link = conn->lnk; memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); memcpy(&aclc.lcl.gid, link->gid, SMC_GID_SIZE); @@ -580,13 +580,13 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) ETH_ALEN); hton24(aclc.qpn, link->roce_qp->qp_num); aclc.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey); aclc.rmbe_idx = 1; /* as long as 1 RMB = 1 RMBE */ aclc.rmbe_alert_token = htonl(conn->alert_token_local); aclc.qp_mtu = link->path_mtu; aclc.rmbe_size = conn->rmbe_size_short, aclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address - (conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); + (conn->rmb_desc->sgt[link->link_idx].sgl)); hton24(aclc.psn, link->psn_initial); memcpy(aclc.smcr_trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index d233112ced2a..1d695093f205 100644 --- a/net/smc/smc_core.c +++ 
b/net/smc/smc_core.c @@ -131,6 +131,11 @@ static void smc_lgr_register_conn(struct smc_connection *conn) conn->alert_token_local = 0; } smc_lgr_add_alert_token(conn); + + /* assign the new connection to a link */ + if (!conn->lgr->is_smcd) + conn->lnk = &conn->lgr->lnk[SMC_SINGLE_LINK]; + conn->lgr->conns_num++; } @@ -275,6 +280,7 @@ static int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, atomic_inc(&ini->ib_dev->lnk_cnt); lnk->state = SMC_LNK_ACTIVATING; lnk->link_id = smcr_next_link_id(lgr); + lnk->lgr = lgr; lnk->link_idx = link_idx; lnk->smcibdev = ini->ib_dev; lnk->ibport = ini->ib_port; @@ -421,7 +427,7 @@ static void smc_buf_unuse(struct smc_connection *conn, if (!lgr->is_smcd && !list_empty(&lgr->list)) { /* unregister rmb with peer */ smc_llc_do_delete_rkey( - &lgr->lnk[SMC_SINGLE_LINK], + conn->lnk, conn->rmb_desc); } conn->rmb_desc->used = 0; @@ -479,16 +485,15 @@ static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; if (is_rmb) { - if (buf_desc->mr_rx[SMC_SINGLE_LINK]) + if (buf_desc->mr_rx[lnk->link_idx]) smc_ib_put_memory_region( - buf_desc->mr_rx[SMC_SINGLE_LINK]); - smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc, - DMA_FROM_DEVICE); + buf_desc->mr_rx[lnk->link_idx]); + smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE); } else { - smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc, - DMA_TO_DEVICE); + smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE); } - sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]); + sg_free_table(&buf_desc->sgt[lnk->link_idx]); + if (buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); kfree(buf_desc); @@ -1026,17 +1031,16 @@ static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, /* build the sg table from the pages */ lnk = &lgr->lnk[SMC_SINGLE_LINK]; - rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1, - GFP_KERNEL); + rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], 1, GFP_KERNEL); if (rc) { smc_buf_free(lgr, is_rmb, buf_desc); return ERR_PTR(rc); } - sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl, + sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl, buf_desc->cpu_addr, bufsize); /* map sg table to DMA address */ - rc = smc_ib_buf_map_sg(lnk->smcibdev, buf_desc, + rc = smc_ib_buf_map_sg(lnk, buf_desc, is_rmb ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE); /* SMC protocol depends on mapping to one DMA address only */ if (rc != 1) { @@ -1049,7 +1053,7 @@ static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, rc = smc_ib_get_memory_region(lnk->roce_pd, IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE, - buf_desc); + buf_desc, lnk->link_idx); if (rc) { smc_buf_free(lgr, is_rmb, buf_desc); return ERR_PTR(rc); @@ -1174,22 +1178,16 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn) { - struct smc_link_group *lgr = conn->lgr; - if (!conn->lgr || conn->lgr->is_smcd) return; - smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - conn->sndbuf_desc, DMA_TO_DEVICE); + smc_ib_sync_sg_for_cpu(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) { - struct smc_link_group *lgr = conn->lgr; - if (!conn->lgr || conn->lgr->is_smcd) return; - smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - conn->sndbuf_desc, DMA_TO_DEVICE); + smc_ib_sync_sg_for_device(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) @@ -1198,7 +1196,7 @@ void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) if (!conn->lgr || conn->lgr->is_smcd) return; - smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, + smc_ib_sync_sg_for_cpu(&lgr->lnk[SMC_SINGLE_LINK], conn->rmb_desc, DMA_FROM_DEVICE); } @@ -1208,7 +1206,7 @@ void smc_rmb_sync_sg_for_device(struct smc_connection *conn) if (!conn->lgr || conn->lgr->is_smcd) return; - smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, + smc_ib_sync_sg_for_device(&lgr->lnk[SMC_SINGLE_LINK], conn->rmb_desc, DMA_FROM_DEVICE); } @@ -1245,15 +1243,16 @@ static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr) } /* add a new rtoken from peer */ -int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey) +int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey) { + struct smc_link_group *lgr = smc_get_lgr(lnk); u64 dma_addr = be64_to_cpu(nw_vaddr); u32 rkey = ntohl(nw_rkey); int i; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { - if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) && - (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) && + if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && + lgr->rtokens[i][lnk->link_idx].dma_addr == dma_addr && test_bit(i, lgr->rtokens_used_mask)) { /* already in list */ return i; @@ -1262,22 +1261,23 @@ int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey) i = smc_rmb_reserve_rtoken_idx(lgr); if (i < 0) return i; - lgr->rtokens[i][SMC_SINGLE_LINK].rkey = rkey; - lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = dma_addr; + lgr->rtokens[i][lnk->link_idx].rkey = rkey; + lgr->rtokens[i][lnk->link_idx].dma_addr = dma_addr; return i; } /* delete an rtoken */ -int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey) +int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey) { + struct smc_link_group *lgr = smc_get_lgr(lnk); u32 rkey = ntohl(nw_rkey); int i; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { - if (lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey && + if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && test_bit(i, lgr->rtokens_used_mask)) { - lgr->rtokens[i][SMC_SINGLE_LINK].rkey = 0; - lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = 0; + lgr->rtokens[i][lnk->link_idx].rkey = 0; + lgr->rtokens[i][lnk->link_idx].dma_addr = 0; clear_bit(i, 
lgr->rtokens_used_mask); return 0; @@ -1290,7 +1290,7 @@ int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey) int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_clc_msg_accept_confirm *clc) { - conn->rtoken_idx = smc_rtoken_add(conn->lgr, clc->rmb_dma_addr, + conn->rtoken_idx = smc_rtoken_add(conn->lnk, clc->rmb_dma_addr, clc->rmb_rkey); if (conn->rtoken_idx < 0) return conn->rtoken_idx; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index c459b0639bf3..c71c35a3596c 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -116,6 +116,7 @@ struct smc_link { u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/ u8 link_id; /* unique # within link group */ u8 link_idx; /* index in lgr link array */ + struct smc_link_group *lgr; /* parent link group */ enum smc_link_state state; /* state of link */ struct workqueue_struct *llc_wq; /* single thread work queue */ @@ -303,8 +304,8 @@ int smc_buf_create(struct smc_sock *smc, bool is_smcd); int smc_uncompress_bufsize(u8 compressed); int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_clc_msg_accept_confirm *clc); -int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey); -int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey); +int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey); +int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey); void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); @@ -319,6 +320,6 @@ void smc_core_exit(void); static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) { - return container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); + return link->lgr; } #endif diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 440f9e319a38..c090678a3e5a 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -389,15 +389,15 @@ void smc_ib_put_memory_region(struct ib_mr *mr) ib_dereg_mr(mr); } -static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot) +static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx) { unsigned int offset = 0; int sg_num; /* map the largest prefix of a dma mapped SG list */ - sg_num = ib_map_mr_sg(buf_slot->mr_rx[SMC_SINGLE_LINK], - buf_slot->sgt[SMC_SINGLE_LINK].sgl, - buf_slot->sgt[SMC_SINGLE_LINK].orig_nents, + sg_num = ib_map_mr_sg(buf_slot->mr_rx[link_idx], + buf_slot->sgt[link_idx].sgl, + buf_slot->sgt[link_idx].orig_nents, &offset, PAGE_SIZE); return sg_num; @@ -405,29 +405,29 @@ static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot) /* Allocate a memory region and map the dma mapped SG list of buf_slot */ int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, - struct smc_buf_desc *buf_slot) + struct smc_buf_desc *buf_slot, u8 link_idx) { - if (buf_slot->mr_rx[SMC_SINGLE_LINK]) + if (buf_slot->mr_rx[link_idx]) return 0; /* already done */ - buf_slot->mr_rx[SMC_SINGLE_LINK] = + buf_slot->mr_rx[link_idx] = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order); - if (IS_ERR(buf_slot->mr_rx[SMC_SINGLE_LINK])) { + if (IS_ERR(buf_slot->mr_rx[link_idx])) { int rc; - rc = PTR_ERR(buf_slot->mr_rx[SMC_SINGLE_LINK]); - buf_slot->mr_rx[SMC_SINGLE_LINK] = NULL; + rc = PTR_ERR(buf_slot->mr_rx[link_idx]); + buf_slot->mr_rx[link_idx] = NULL; return rc; } - if (smc_ib_map_mr_sg(buf_slot) != 1) + if (smc_ib_map_mr_sg(buf_slot, link_idx) != 1) return -EINVAL; return 0; } /* synchronize buffer usage for cpu access */ -void 
smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, +void smc_ib_sync_sg_for_cpu(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { @@ -435,11 +435,11 @@ void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, unsigned int i; /* for now there is just one DMA address */ - for_each_sg(buf_slot->sgt[SMC_SINGLE_LINK].sgl, sg, - buf_slot->sgt[SMC_SINGLE_LINK].nents, i) { + for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, + buf_slot->sgt[lnk->link_idx].nents, i) { if (!sg_dma_len(sg)) break; - ib_dma_sync_single_for_cpu(smcibdev->ibdev, + ib_dma_sync_single_for_cpu(lnk->smcibdev->ibdev, sg_dma_address(sg), sg_dma_len(sg), data_direction); @@ -447,7 +447,7 @@ void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, } /* synchronize buffer usage for device access */ -void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, +void smc_ib_sync_sg_for_device(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { @@ -455,11 +455,11 @@ void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, unsigned int i; /* for now there is just one DMA address */ - for_each_sg(buf_slot->sgt[SMC_SINGLE_LINK].sgl, sg, - buf_slot->sgt[SMC_SINGLE_LINK].nents, i) { + for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, + buf_slot->sgt[lnk->link_idx].nents, i) { if (!sg_dma_len(sg)) break; - ib_dma_sync_single_for_device(smcibdev->ibdev, + ib_dma_sync_single_for_device(lnk->smcibdev->ibdev, sg_dma_address(sg), sg_dma_len(sg), data_direction); @@ -467,15 +467,15 @@ void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, } /* Map a new TX or RX buffer SG-table to DMA */ -int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev, +int smc_ib_buf_map_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { int mapped_nents; - mapped_nents = ib_dma_map_sg(smcibdev->ibdev, - buf_slot->sgt[SMC_SINGLE_LINK].sgl, - buf_slot->sgt[SMC_SINGLE_LINK].orig_nents, + mapped_nents = ib_dma_map_sg(lnk->smcibdev->ibdev, + buf_slot->sgt[lnk->link_idx].sgl, + buf_slot->sgt[lnk->link_idx].orig_nents, data_direction); if (!mapped_nents) return -ENOMEM; @@ -483,18 +483,18 @@ int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev, return mapped_nents; } -void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev, +void smc_ib_buf_unmap_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { - if (!buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address) + if (!buf_slot->sgt[lnk->link_idx].sgl->dma_address) return; /* already unmapped */ - ib_dma_unmap_sg(smcibdev->ibdev, - buf_slot->sgt[SMC_SINGLE_LINK].sgl, - buf_slot->sgt[SMC_SINGLE_LINK].orig_nents, + ib_dma_unmap_sg(lnk->smcibdev->ibdev, + buf_slot->sgt[lnk->link_idx].sgl, + buf_slot->sgt[lnk->link_idx].orig_nents, data_direction); - buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address = 0; + buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0; } long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 5c2b115d36da..e6a696ae15f3 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -59,10 +59,10 @@ struct smc_link; int smc_ib_register_client(void) __init; void smc_ib_unregister_client(void); bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport); -int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev, +int smc_ib_buf_map_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); -void smc_ib_buf_unmap_sg(struct 
smc_ib_device *smcibdev, +void smc_ib_buf_unmap_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); void smc_ib_dealloc_protection_domain(struct smc_link *lnk); @@ -74,12 +74,12 @@ int smc_ib_modify_qp_rts(struct smc_link *lnk); int smc_ib_modify_qp_reset(struct smc_link *lnk); long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev); int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, - struct smc_buf_desc *buf_slot); + struct smc_buf_desc *buf_slot, u8 link_idx); void smc_ib_put_memory_region(struct ib_mr *mr); -void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, +void smc_ib_sync_sg_for_cpu(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); -void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, +void smc_ib_sync_sg_for_device(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 0e52aab53d97..34d0752ba6af 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -231,9 +231,9 @@ static int smc_llc_send_confirm_rkey(struct smc_link *link, rkeyllc->hd.common.type = SMC_LLC_CONFIRM_RKEY; rkeyllc->hd.length = sizeof(struct smc_llc_msg_confirm_rkey); rkeyllc->rtoken[0].rmb_key = - htonl(rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + htonl(rmb_desc->mr_rx[link->link_idx]->rkey); rkeyllc->rtoken[0].rmb_vaddr = cpu_to_be64( - (u64)sg_dma_address(rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); + (u64)sg_dma_address(rmb_desc->sgt[link->link_idx].sgl)); /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; @@ -256,7 +256,7 @@ static int smc_llc_send_delete_rkey(struct smc_link *link, rkeyllc->hd.common.type = SMC_LLC_DELETE_RKEY; rkeyllc->hd.length = sizeof(struct smc_llc_msg_delete_rkey); rkeyllc->num_rkeys = 1; - rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[link->link_idx]->rkey); /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; @@ -501,7 +501,7 @@ static void smc_llc_rx_confirm_rkey(struct smc_link *link, SMC_LLC_FLAG_RKEY_NEG; complete(&link->llc_confirm_rkey); } else { - rc = smc_rtoken_add(smc_get_lgr(link), + rc = smc_rtoken_add(link, llc->rtoken[0].rmb_vaddr, llc->rtoken[0].rmb_key); @@ -539,7 +539,7 @@ static void smc_llc_rx_delete_rkey(struct smc_link *link, } else { max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); for (i = 0; i < max; i++) { - if (smc_rtoken_delete(smc_get_lgr(link), llc->rkey[i])) + if (smc_rtoken_delete(link, llc->rkey[i])) err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i); } diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 9f1ade86d70e..d74bfe6a90f1 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -269,19 +269,18 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, int num_sges, struct ib_rdma_wr *rdma_wr) { struct smc_link_group *lgr = conn->lgr; - struct smc_link *link; + struct smc_link *link = conn->lnk; int rc; - link = &lgr->lnk[SMC_SINGLE_LINK]; rdma_wr->wr.wr_id = smc_wr_tx_get_next_wr_id(link); rdma_wr->wr.num_sge = num_sges; rdma_wr->remote_addr = - lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr + + lgr->rtokens[conn->rtoken_idx][link->link_idx].dma_addr + /* RMBE within RMB */ conn->tx_off + /* offset within RMBE */ peer_rmbe_offset; - rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey; + rdma_wr->rkey = 
lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey; rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); if (rc) smc_lgr_terminate_sched(lgr); @@ -310,8 +309,10 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, size_t dst_off, size_t dst_len, struct smc_rdma_wr *wr_rdma_buf) { + struct smc_link *link = conn->lnk; + dma_addr_t dma_addr = - sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl); + sg_dma_address(conn->sndbuf_desc->sgt[link->link_idx].sgl); int src_len_sum = src_len, dst_len_sum = dst_len; int sent_count = src_off; int srcchunk, dstchunk; @@ -507,7 +508,7 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) if (!pflags->urg_data_present) { rc = smc_tx_rdma_writes(conn, wr_rdma_buf); if (rc) { - smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], + smc_wr_tx_put_slot(conn->lnk, (struct smc_wr_tx_pend_priv *)pend); goto out_unlock; } -- cgit v1.2.3 From b9247544c1bccfe1b74ddf1dade719a69946cbb1 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:41 +0200 Subject: net/smc: convert static link ID instances to support multiple links As a preparation for the support of multiple links remove the usage of a static link id (SMC_SINGLE_LINK) and allow dynamic link ids. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 54 ++++++--- net/smc/smc_clc.h | 1 + net/smc/smc_core.c | 332 ++++++++++++++++++++++++++++++++++++----------------- net/smc/smc_core.h | 37 +++--- net/smc/smc_llc.c | 2 + 5 files changed, 291 insertions(+), 135 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 6e4bad8c64a8..890dc6422f8c 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -338,28 +338,48 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) } /* register a new rmb, send confirm_rkey msg to register with peer */ -static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc, - bool conf_rkey) +static int smcr_link_reg_rmb(struct smc_link *link, + struct smc_buf_desc *rmb_desc, bool conf_rkey) { - if (!rmb_desc->wr_reg) { + if (!rmb_desc->is_reg_mr[link->link_idx]) { /* register memory region for new rmb */ if (smc_wr_reg_send(link, rmb_desc->mr_rx[link->link_idx])) { - rmb_desc->regerr = 1; + rmb_desc->is_reg_err = true; return -EFAULT; } - rmb_desc->wr_reg = 1; + rmb_desc->is_reg_mr[link->link_idx] = true; } if (!conf_rkey) return 0; + /* exchange confirm_rkey msg with peer */ - if (smc_llc_do_confirm_rkey(link, rmb_desc)) { - rmb_desc->regerr = 1; - return -EFAULT; + if (!rmb_desc->is_conf_rkey) { + if (smc_llc_do_confirm_rkey(link, rmb_desc)) { + rmb_desc->is_reg_err = true; + return -EFAULT; + } + rmb_desc->is_conf_rkey = true; + } + return 0; +} + +/* register the new rmb on all links */ +static int smcr_lgr_reg_rmbs(struct smc_link_group *lgr, + struct smc_buf_desc *rmb_desc) +{ + int i, rc; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state != SMC_LNK_ACTIVE) + continue; + rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc, true); + if (rc) + return rc; } return 0; } -static int smc_clnt_conf_first_link(struct smc_sock *smc) +static int smcr_clnt_conf_first_link(struct smc_sock *smc) { struct net *net = sock_net(smc->clcsock->sk); struct smc_link *link = smc->conn.lnk; @@ -387,7 +407,7 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) smc_wr_remember_qp_attr(link); - if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) + if (smcr_link_reg_rmb(link, smc->conn.rmb_desc, false)) return 
SMC_CLC_DECL_ERR_REGRMB; /* send CONFIRM LINK response over RoCE fabric */ @@ -632,7 +652,7 @@ static int smc_connect_rdma(struct smc_sock *smc, return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK, ini->cln_first_contact); } else { - if (smc_reg_rmb(link, smc->conn.rmb_desc, true)) + if (smcr_lgr_reg_rmbs(smc->conn.lgr, smc->conn.rmb_desc)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB, ini->cln_first_contact); } @@ -647,7 +667,7 @@ static int smc_connect_rdma(struct smc_sock *smc, if (ini->cln_first_contact == SMC_FIRST_CONTACT) { /* QP confirmation over RoCE fabric */ - reason_code = smc_clnt_conf_first_link(smc); + reason_code = smcr_clnt_conf_first_link(smc); if (reason_code) return smc_connect_abort(smc, reason_code, ini->cln_first_contact); @@ -997,14 +1017,14 @@ void smc_close_non_accepted(struct sock *sk) sock_put(sk); /* final sock_put */ } -static int smc_serv_conf_first_link(struct smc_sock *smc) +static int smcr_serv_conf_first_link(struct smc_sock *smc) { struct net *net = sock_net(smc->clcsock->sk); struct smc_link *link = smc->conn.lnk; int rest; int rc; - if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) + if (smcr_link_reg_rmb(link, smc->conn.rmb_desc, false)) return SMC_CLC_DECL_ERR_REGRMB; /* send CONFIRM LINK request to client over the RoCE fabric */ @@ -1189,10 +1209,10 @@ static int smc_listen_ism_init(struct smc_sock *new_smc, /* listen worker: register buffers */ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) { - struct smc_link *link = new_smc->conn.lnk; + struct smc_connection *conn = &new_smc->conn; if (local_contact != SMC_FIRST_CONTACT) { - if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true)) + if (smcr_lgr_reg_rmbs(conn->lgr, conn->rmb_desc)) return SMC_CLC_DECL_ERR_REGRMB; } smc_rmb_sync_sg_for_device(&new_smc->conn); @@ -1222,7 +1242,7 @@ static int smc_listen_rdma_finish(struct smc_sock *new_smc, goto decline; } /* QP confirmation over RoCE fabric */ - reason_code = smc_serv_conf_first_link(new_smc); + reason_code = smcr_serv_conf_first_link(new_smc); if (reason_code) goto decline; } diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index ca209272e5fa..4f2e150a2be1 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -44,6 +44,7 @@ #define SMC_CLC_DECL_DIFFPREFIX 0x03070000 /* IP prefix / subnet mismatch */ #define SMC_CLC_DECL_GETVLANERR 0x03080000 /* err to get vlan id of ip device*/ #define SMC_CLC_DECL_ISMVLANERR 0x03090000 /* err to reg vlan id on ism dev */ +#define SMC_CLC_DECL_NOACTLINK 0x030a0000 /* no active smc-r link in lgr */ #define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */ #define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */ #define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 1d695093f205..5df3f8f41d19 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -116,7 +116,7 @@ static void smc_lgr_add_alert_token(struct smc_connection *conn) * Requires @conns_lock * Note that '0' is a reserved value and not assigned. 
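+ * Returns 0 on success, or SMC_CLC_DECL_NOACTLINK when no active or
+ * activating link can be assigned to the connection.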
*/ -static void smc_lgr_register_conn(struct smc_connection *conn) +static int smc_lgr_register_conn(struct smc_connection *conn) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); static atomic_t nexttoken = ATOMIC_INIT(0); @@ -133,10 +133,22 @@ static void smc_lgr_register_conn(struct smc_connection *conn) smc_lgr_add_alert_token(conn); /* assign the new connection to a link */ - if (!conn->lgr->is_smcd) - conn->lnk = &conn->lgr->lnk[SMC_SINGLE_LINK]; + if (!conn->lgr->is_smcd) { + struct smc_link *lnk; + int i; + /* tbd - link balancing */ + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + lnk = &conn->lgr->lnk[i]; + if (lnk->state == SMC_LNK_ACTIVATING || + lnk->state == SMC_LNK_ACTIVE) + conn->lnk = lnk; + } + if (!conn->lnk) + return SMC_CLC_DECL_NOACTLINK; + } conn->lgr->conns_num++; + return 0; } /* Unregister connection and reset the alert token of the given connection< @@ -202,8 +214,8 @@ static void smc_lgr_free_work(struct work_struct *work) struct smc_link_group, free_work); spinlock_t *lgr_lock; - struct smc_link *lnk; bool conns; + int i; smc_lgr_list_head(lgr, &lgr_lock); spin_lock_bh(lgr_lock); @@ -220,25 +232,38 @@ static void smc_lgr_free_work(struct work_struct *work) } list_del_init(&lgr->list); /* remove from smc_lgr_list */ - lnk = &lgr->lnk[SMC_SINGLE_LINK]; if (!lgr->is_smcd && !lgr->terminating) { - /* try to send del link msg, on error free lgr immediately */ - if (lnk->state == SMC_LNK_ACTIVE && - !smcr_link_send_delete(lnk, true)) { - /* reschedule in case we never receive a response */ - smc_lgr_schedule_free_work(lgr); + bool do_wait = false; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; + /* try to send del link msg, on err free immediately */ + if (lnk->state == SMC_LNK_ACTIVE && + !smcr_link_send_delete(lnk, true)) { + /* reschedule in case we never receive a resp */ + smc_lgr_schedule_free_work(lgr); + do_wait = true; + } + } + if (do_wait) { spin_unlock_bh(lgr_lock); - return; + return; /* wait for resp, see smc_llc_rx_delete_link */ } } lgr->freeing = 1; /* this instance does the freeing, no new schedule */ spin_unlock_bh(lgr_lock); cancel_delayed_work(&lgr->free_work); - if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE) - smc_llc_link_inactive(lnk); if (lgr->is_smcd && !lgr->terminating) smc_ism_signal_shutdown(lgr); + if (!lgr->is_smcd) { + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; + + if (lnk->state != SMC_LNK_INACTIVE) + smc_llc_link_inactive(lnk); + } + } smc_lgr_free(lgr); } @@ -417,29 +442,37 @@ out: return rc; } +static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc, + struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + + if (rmb_desc->is_conf_rkey && !list_empty(&lgr->list)) { + /* unregister rmb with peer */ + smc_llc_do_delete_rkey(lnk, rmb_desc); + rmb_desc->is_conf_rkey = false; + } + if (rmb_desc->is_reg_err) { + /* buf registration failed, reuse not possible */ + write_lock_bh(&lgr->rmbs_lock); + list_del(&rmb_desc->list); + write_unlock_bh(&lgr->rmbs_lock); + + smc_buf_free(lgr, true, rmb_desc); + } else { + rmb_desc->used = 0; + } +} + static void smc_buf_unuse(struct smc_connection *conn, struct smc_link_group *lgr) { if (conn->sndbuf_desc) conn->sndbuf_desc->used = 0; - if (conn->rmb_desc) { - if (!conn->rmb_desc->regerr) { - if (!lgr->is_smcd && !list_empty(&lgr->list)) { - /* unregister rmb with peer */ - smc_llc_do_delete_rkey( - conn->lnk, - conn->rmb_desc); - } - conn->rmb_desc->used = 0; - } else { - /* buf 
registration failed, reuse not possible */ - write_lock_bh(&lgr->rmbs_lock); - list_del(&conn->rmb_desc->list); - write_unlock_bh(&lgr->rmbs_lock); - - smc_buf_free(lgr, true, conn->rmb_desc); - } - } + if (conn->rmb_desc && lgr->is_smcd) + conn->rmb_desc->used = 0; + else if (conn->rmb_desc) + smcr_buf_unuse(conn->rmb_desc, conn->lnk); } /* remove a finished connection from its link group */ @@ -467,6 +500,8 @@ void smc_conn_free(struct smc_connection *conn) static void smcr_link_clear(struct smc_link *lnk) { + if (lnk->peer_qpn == 0) + return; lnk->peer_qpn = 0; smc_llc_link_clear(lnk); smc_ib_modify_qp_reset(lnk); @@ -482,17 +517,23 @@ static void smcr_link_clear(struct smc_link *lnk) static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc) { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + struct smc_link *lnk; + int i; - if (is_rmb) { - if (buf_desc->mr_rx[lnk->link_idx]) - smc_ib_put_memory_region( - buf_desc->mr_rx[lnk->link_idx]); - smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE); - } else { - smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + lnk = &lgr->lnk[i]; + if (!buf_desc->is_map_ib[lnk->link_idx]) + continue; + if (is_rmb) { + if (buf_desc->mr_rx[lnk->link_idx]) + smc_ib_put_memory_region( + buf_desc->mr_rx[lnk->link_idx]); + smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE); + } else { + smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE); + } + sg_free_table(&buf_desc->sgt[lnk->link_idx]); } - sg_free_table(&buf_desc->sgt[lnk->link_idx]); if (buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); @@ -551,6 +592,8 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr) /* remove a link group */ static void smc_lgr_free(struct smc_link_group *lgr) { + int i; + smc_lgr_free_bufs(lgr); if (lgr->is_smcd) { if (!lgr->terminating) { @@ -560,7 +603,11 @@ static void smc_lgr_free(struct smc_link_group *lgr) if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) wake_up(&lgr->smcd->lgrs_deleted); } else { - smcr_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state == SMC_LNK_INACTIVE) + continue; + smcr_link_clear(&lgr->lnk[i]); + } if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); } @@ -628,16 +675,20 @@ static void smc_conn_kill(struct smc_connection *conn, bool soft) static void smc_lgr_cleanup(struct smc_link_group *lgr) { + int i; + if (lgr->is_smcd) { smc_ism_signal_shutdown(lgr); smcd_unregister_all_dmbs(lgr); smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); put_device(&lgr->smcd->dev); } else { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; - if (lnk->state != SMC_LNK_INACTIVE) - smc_llc_link_inactive(lnk); + if (lnk->state != SMC_LNK_INACTIVE) + smc_llc_link_inactive(lnk); + } } } @@ -650,6 +701,7 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) struct smc_connection *conn; struct smc_sock *smc; struct rb_node *node; + int i; if (lgr->terminating) return; /* lgr already terminating */ @@ -657,7 +709,8 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) cancel_delayed_work_sync(&lgr->free_work); lgr->terminating = 1; if (!lgr->is_smcd) - smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + smc_llc_link_inactive(&lgr->lnk[i]); /* kill remaining link group connections */ read_lock_bh(&lgr->conns_lock); @@ -703,14 +756,22 @@ void 
smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) { struct smc_link_group *lgr, *l; LIST_HEAD(lgr_free_list); + int i; spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { - if (!lgr->is_smcd && - lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && - lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) { - list_move(&lgr->list, &lgr_free_list); - lgr->freeing = 1; + if (lgr->is_smcd) + continue; + /* tbd - terminate only when no more links are active */ + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state == SMC_LNK_INACTIVE || + lgr->lnk[i].state == SMC_LNK_DELETING) + continue; + if (lgr->lnk[i].smcibdev == smcibdev && + lgr->lnk[i].ibport == ibport) { + list_move(&lgr->list, &lgr_free_list); + lgr->freeing = 1; + } } } spin_unlock_bh(&smc_lgr_list.lock); @@ -775,6 +836,7 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) { struct smc_link_group *lgr, *lg; LIST_HEAD(lgr_free_list); + int i; spin_lock_bh(&smc_lgr_list.lock); if (!smcibdev) { @@ -783,9 +845,12 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) lgr->freeing = 1; } else { list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { - if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev) { - list_move(&lgr->list, &lgr_free_list); - lgr->freeing = 1; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].smcibdev == smcibdev) { + list_move(&lgr->list, &lgr_free_list); + lgr->freeing = 1; + break; + } } } } @@ -857,15 +922,21 @@ static bool smcr_lgr_match(struct smc_link_group *lgr, struct smc_clc_msg_local *lcl, enum smc_lgr_role role, u32 clcqpn) { - return !memcmp(lgr->peer_systemid, lcl->id_for_peer, - SMC_SYSTEMID_LEN) && - !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid, - SMC_GID_SIZE) && - !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac, - sizeof(lcl->mac)) && - lgr->role == role && - (lgr->role == SMC_SERV || - lgr->lnk[SMC_SINGLE_LINK].peer_qpn == clcqpn); + int i; + + if (memcmp(lgr->peer_systemid, lcl->id_for_peer, SMC_SYSTEMID_LEN) || + lgr->role != role) + return false; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state != SMC_LNK_ACTIVE) + continue; + if ((lgr->role == SMC_SERV || lgr->lnk[i].peer_qpn == clcqpn) && + !memcmp(lgr->lnk[i].peer_gid, &lcl->gid, SMC_GID_SIZE) && + !memcmp(lgr->lnk[i].peer_mac, lcl->mac, sizeof(lcl->mac))) + return true; + } + return false; } static bool smcd_lgr_match(struct smc_link_group *lgr, @@ -906,15 +977,17 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) /* link group found */ ini->cln_first_contact = SMC_REUSE_CONTACT; conn->lgr = lgr; - smc_lgr_register_conn(conn); /* add smc conn to lgr */ - if (delayed_work_pending(&lgr->free_work)) - cancel_delayed_work(&lgr->free_work); + rc = smc_lgr_register_conn(conn); /* add conn to lgr */ write_unlock_bh(&lgr->conns_lock); + if (!rc && delayed_work_pending(&lgr->free_work)) + cancel_delayed_work(&lgr->free_work); break; } write_unlock_bh(&lgr->conns_lock); } spin_unlock_bh(lgr_lock); + if (rc) + return rc; if (role == SMC_CLNT && !ini->srv_first_contact && ini->cln_first_contact == SMC_FIRST_CONTACT) { @@ -932,8 +1005,10 @@ create: goto out; lgr = conn->lgr; write_lock_bh(&lgr->conns_lock); - smc_lgr_register_conn(conn); /* add smc conn to lgr */ + rc = smc_lgr_register_conn(conn); /* add smc conn to lgr */ write_unlock_bh(&lgr->conns_lock); + if (rc) + goto out; } conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; @@ -1006,12 +1081,55 @@ 
static inline int smc_rmb_wnd_update_limit(int rmbe_size) return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); } +/* map an rmb buf to a link */ +static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, + struct smc_link *lnk) +{ + int rc; + + if (buf_desc->is_map_ib[lnk->link_idx]) + return 0; + + rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], 1, GFP_KERNEL); + if (rc) + return rc; + sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl, + buf_desc->cpu_addr, buf_desc->len); + + /* map sg table to DMA address */ + rc = smc_ib_buf_map_sg(lnk, buf_desc, + is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); + /* SMC protocol depends on mapping to one DMA address only */ + if (rc != 1) { + rc = -EAGAIN; + goto free_table; + } + + /* create a new memory region for the RMB */ + if (is_rmb) { + rc = smc_ib_get_memory_region(lnk->roce_pd, + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_LOCAL_WRITE, + buf_desc, lnk->link_idx); + if (rc) + goto buf_unmap; + smc_ib_sync_sg_for_device(lnk, buf_desc, DMA_FROM_DEVICE); + } + buf_desc->is_map_ib[lnk->link_idx] = true; + return 0; + +buf_unmap: + smc_ib_buf_unmap_sg(lnk, buf_desc, + is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); +free_table: + sg_free_table(&buf_desc->sgt[lnk->link_idx]); + return rc; +} + static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, bool is_rmb, int bufsize) { struct smc_buf_desc *buf_desc; - struct smc_link *lnk; - int rc; /* try to alloc a new buffer */ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); @@ -1028,40 +1146,32 @@ static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, return ERR_PTR(-EAGAIN); } buf_desc->cpu_addr = (void *)page_address(buf_desc->pages); + buf_desc->len = bufsize; + return buf_desc; +} - /* build the sg table from the pages */ - lnk = &lgr->lnk[SMC_SINGLE_LINK]; - rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], 1, GFP_KERNEL); - if (rc) { - smc_buf_free(lgr, is_rmb, buf_desc); - return ERR_PTR(rc); - } - sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl, - buf_desc->cpu_addr, bufsize); +/* map buf_desc on all usable links, + * unused buffers stay mapped as long as the link is up + */ +static int smcr_buf_map_usable_links(struct smc_link_group *lgr, + struct smc_buf_desc *buf_desc, bool is_rmb) +{ + int i, rc = 0; - /* map sg table to DMA address */ - rc = smc_ib_buf_map_sg(lnk, buf_desc, - is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); - /* SMC protocol depends on mapping to one DMA address only */ - if (rc != 1) { - smc_buf_free(lgr, is_rmb, buf_desc); - return ERR_PTR(-EAGAIN); - } + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; - /* create a new memory region for the RMB */ - if (is_rmb) { - rc = smc_ib_get_memory_region(lnk->roce_pd, - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_LOCAL_WRITE, - buf_desc, lnk->link_idx); - if (rc) { - smc_buf_free(lgr, is_rmb, buf_desc); - return ERR_PTR(rc); + if (lnk->state != SMC_LNK_ACTIVE && + lnk->state != SMC_LNK_ACTIVATING) + continue; + if (smcr_buf_map_link(buf_desc, is_rmb, lnk)) { + smcr_buf_unuse(buf_desc, lnk); + rc = -ENOMEM; + goto out; } } - - buf_desc->len = bufsize; - return buf_desc; +out: + return rc; } #define SMCD_DMBE_SIZES 7 /* 0 -> 16KB, 1 -> 32KB, .. 
6 -> 1MB */ @@ -1159,6 +1269,12 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (IS_ERR(buf_desc)) return -ENOMEM; + if (!is_smcd) { + if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) { + return -ENOMEM; + } + } + if (is_rmb) { conn->rmb_desc = buf_desc; conn->rmbe_size_short = bufsize_short; @@ -1192,22 +1308,32 @@ void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) { - struct smc_link_group *lgr = conn->lgr; + int i; if (!conn->lgr || conn->lgr->is_smcd) return; - smc_ib_sync_sg_for_cpu(&lgr->lnk[SMC_SINGLE_LINK], - conn->rmb_desc, DMA_FROM_DEVICE); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (conn->lgr->lnk[i].state != SMC_LNK_ACTIVE && + conn->lgr->lnk[i].state != SMC_LNK_ACTIVATING) + continue; + smc_ib_sync_sg_for_cpu(&conn->lgr->lnk[i], conn->rmb_desc, + DMA_FROM_DEVICE); + } } void smc_rmb_sync_sg_for_device(struct smc_connection *conn) { - struct smc_link_group *lgr = conn->lgr; + int i; if (!conn->lgr || conn->lgr->is_smcd) return; - smc_ib_sync_sg_for_device(&lgr->lnk[SMC_SINGLE_LINK], - conn->rmb_desc, DMA_FROM_DEVICE); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (conn->lgr->lnk[i].state != SMC_LNK_ACTIVE && + conn->lgr->lnk[i].state != SMC_LNK_ACTIVATING) + continue; + smc_ib_sync_sg_for_device(&conn->lgr->lnk[i], conn->rmb_desc, + DMA_FROM_DEVICE); + } } /* create the send and receive buffer for an SMC socket; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index c71c35a3596c..66753ba23bc6 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -152,25 +152,32 @@ struct smc_buf_desc { struct page *pages; int len; /* length of buffer */ u32 used; /* currently used / unused */ - u8 wr_reg : 1; /* mem region registered */ - u8 regerr : 1; /* err during registration */ union { struct { /* SMC-R */ - struct sg_table sgt[SMC_LINKS_PER_LGR_MAX]; - /* virtual buffer */ - struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; - /* for rmb only: memory region - * incl. rkey provided to peer - */ - u32 order; /* allocation order */ + struct sg_table sgt[SMC_LINKS_PER_LGR_MAX]; + /* virtual buffer */ + struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; + /* for rmb only: memory region + * incl. 
rkey provided to peer + */ + u32 order; /* allocation order */ + + u8 is_conf_rkey; + /* confirm_rkey done */ + u8 is_reg_mr[SMC_LINKS_PER_LGR_MAX]; + /* mem region registered */ + u8 is_map_ib[SMC_LINKS_PER_LGR_MAX]; + /* mem region mapped to lnk */ + u8 is_reg_err; + /* buffer registration err */ }; struct { /* SMC-D */ - unsigned short sba_idx; - /* SBA index number */ - u64 token; - /* DMB token number */ - dma_addr_t dma_addr; - /* DMA address */ + unsigned short sba_idx; + /* SBA index number */ + u64 token; + /* DMB token number */ + dma_addr_t dma_addr; + /* DMA address */ }; }; }; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 34d0752ba6af..903ae068da3a 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -662,6 +662,8 @@ void smc_llc_link_deleting(struct smc_link *link) /* called in tasklet context */ void smc_llc_link_inactive(struct smc_link *link) { + if (link->state == SMC_LNK_INACTIVE) + return; link->state = SMC_LNK_INACTIVE; cancel_delayed_work(&link->llc_testlink_wrk); smc_wr_wakeup_reg_wait(link); -- cgit v1.2.3 From e07d31dc16b0d77ff6b3f71cafe3a825fb80bed4 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:42 +0200 Subject: net/smc: multi-link support for smc_rmb_rtoken_handling() Extend smc_rmb_rtoken_handling() and smc_rtoken_delete() to support multiple links. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 4 ++-- net/smc/smc_core.c | 14 ++++++++------ net/smc/smc_core.h | 2 +- 3 files changed, 11 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 890dc6422f8c..e39f6aedd3bd 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -640,7 +640,7 @@ static int smc_connect_rdma(struct smc_sock *smc, if (ini->cln_first_contact == SMC_FIRST_CONTACT) smc_link_save_peer_info(link, aclc); - if (smc_rmb_rtoken_handling(&smc->conn, aclc)) + if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK, ini->cln_first_contact); @@ -1231,7 +1231,7 @@ static int smc_listen_rdma_finish(struct smc_sock *new_smc, if (local_contact == SMC_FIRST_CONTACT) smc_link_save_peer_info(link, cclc); - if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) { + if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc)) { reason_code = SMC_CLC_DECL_ERR_RTOK; goto decline; } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 5df3f8f41d19..e8897d60b27f 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1392,19 +1392,20 @@ int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey) return i; } -/* delete an rtoken */ +/* delete an rtoken from all links */ int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey) { struct smc_link_group *lgr = smc_get_lgr(lnk); u32 rkey = ntohl(nw_rkey); - int i; + int i, j; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && test_bit(i, lgr->rtokens_used_mask)) { - lgr->rtokens[i][lnk->link_idx].rkey = 0; - lgr->rtokens[i][lnk->link_idx].dma_addr = 0; - + for (j = 0; j < SMC_LINKS_PER_LGR_MAX; j++) { + lgr->rtokens[i][j].rkey = 0; + lgr->rtokens[i][j].dma_addr = 0; + } clear_bit(i, lgr->rtokens_used_mask); return 0; } @@ -1414,9 +1415,10 @@ int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey) /* save rkey and dma_addr received from peer during clc handshake */ int smc_rmb_rtoken_handling(struct smc_connection *conn, + struct smc_link *lnk, struct smc_clc_msg_accept_confirm *clc) { - conn->rtoken_idx 
= smc_rtoken_add(conn->lnk, clc->rmb_dma_addr, + conn->rtoken_idx = smc_rtoken_add(lnk, clc->rmb_dma_addr, clc->rmb_rkey); if (conn->rtoken_idx < 0) return conn->rtoken_idx; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 66753ba23bc6..f68ba187ecf8 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -309,7 +309,7 @@ void smc_smcd_terminate_all(struct smcd_dev *dev); void smc_smcr_terminate_all(struct smc_ib_device *smcibdev); int smc_buf_create(struct smc_sock *smc, bool is_smcd); int smc_uncompress_bufsize(u8 compressed); -int smc_rmb_rtoken_handling(struct smc_connection *conn, +int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link, struct smc_clc_msg_accept_confirm *clc); int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey); int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey); -- cgit v1.2.3 From d854fcbfaeda9748c85de296fbe07b7763a1939c Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:43 +0200 Subject: net/smc: add new link state and related helpers Before a link can be reused it must have been cleared. Lowest current link state is INACTIVE, which does not mean that the link is already cleared. Add a new state UNUSED that is set when the link is cleared and can be reused. Add helper smc_llc_usable_link() to find an active link in a link group, and smc_link_usable() to determine if a link is usable. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 36 +++++++++++++++++++----------------- net/smc/smc_core.h | 9 +++++++++ net/smc/smc_llc.c | 4 ++-- net/smc/smc_llc.h | 11 +++++++++++ net/smc/smc_wr.c | 2 +- 5 files changed, 42 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index e8897d60b27f..57890cbd4e8a 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -260,7 +260,7 @@ static void smc_lgr_free_work(struct work_struct *work) for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; - if (lnk->state != SMC_LNK_INACTIVE) + if (smc_link_usable(lnk)) smc_llc_link_inactive(lnk); } } @@ -286,7 +286,7 @@ static u8 smcr_next_link_id(struct smc_link_group *lgr) if (!link_id) /* skip zero as link_id */ link_id = ++lgr->next_link_id; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (lgr->lnk[i].state != SMC_LNK_INACTIVE && + if (smc_link_usable(&lgr->lnk[i]) && lgr->lnk[i].link_id == link_id) continue; } @@ -350,6 +350,7 @@ clear_llc_lnk: out: put_device(&ini->ib_dev->ibdev->dev); memset(lnk, 0, sizeof(struct smc_link)); + lnk->state = SMC_LNK_UNUSED; if (!atomic_dec_return(&ini->ib_dev->lnk_cnt)) wake_up(&ini->ib_dev->lnks_deleted); return rc; @@ -500,6 +501,8 @@ void smc_conn_free(struct smc_connection *conn) static void smcr_link_clear(struct smc_link *lnk) { + struct smc_ib_device *smcibdev; + if (lnk->peer_qpn == 0) return; lnk->peer_qpn = 0; @@ -510,8 +513,11 @@ static void smcr_link_clear(struct smc_link *lnk) smc_ib_dealloc_protection_domain(lnk); smc_wr_free_link_mem(lnk); put_device(&lnk->smcibdev->ibdev->dev); - if (!atomic_dec_return(&lnk->smcibdev->lnk_cnt)) - wake_up(&lnk->smcibdev->lnks_deleted); + smcibdev = lnk->smcibdev; + memset(lnk, 0, sizeof(struct smc_link)); + lnk->state = SMC_LNK_UNUSED; + if (!atomic_dec_return(&smcibdev->lnk_cnt)) + wake_up(&smcibdev->lnks_deleted); } static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, @@ -604,9 +610,8 @@ static void smc_lgr_free(struct smc_link_group *lgr) 
wake_up(&lgr->smcd->lgrs_deleted); } else { for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (lgr->lnk[i].state == SMC_LNK_INACTIVE) - continue; - smcr_link_clear(&lgr->lnk[i]); + if (lgr->lnk[i].state != SMC_LNK_UNUSED) + smcr_link_clear(&lgr->lnk[i]); } if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); @@ -686,7 +691,7 @@ static void smc_lgr_cleanup(struct smc_link_group *lgr) for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; - if (lnk->state != SMC_LNK_INACTIVE) + if (smc_link_usable(lnk)) smc_llc_link_inactive(lnk); } } @@ -764,7 +769,7 @@ void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) continue; /* tbd - terminate only when no more links are active */ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (lgr->lnk[i].state == SMC_LNK_INACTIVE || + if (!smc_link_usable(&lgr->lnk[i]) || lgr->lnk[i].state == SMC_LNK_DELETING) continue; if (lgr->lnk[i].smcibdev == smcibdev && @@ -1161,8 +1166,7 @@ static int smcr_buf_map_usable_links(struct smc_link_group *lgr, for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; - if (lnk->state != SMC_LNK_ACTIVE && - lnk->state != SMC_LNK_ACTIVATING) + if (!smc_link_usable(lnk)) continue; if (smcr_buf_map_link(buf_desc, is_rmb, lnk)) { smcr_buf_unuse(buf_desc, lnk); @@ -1294,14 +1298,14 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn) { - if (!conn->lgr || conn->lgr->is_smcd) + if (!conn->lgr || conn->lgr->is_smcd || !smc_link_usable(conn->lnk)) return; smc_ib_sync_sg_for_cpu(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) { - if (!conn->lgr || conn->lgr->is_smcd) + if (!conn->lgr || conn->lgr->is_smcd || !smc_link_usable(conn->lnk)) return; smc_ib_sync_sg_for_device(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } @@ -1313,8 +1317,7 @@ void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) if (!conn->lgr || conn->lgr->is_smcd) return; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (conn->lgr->lnk[i].state != SMC_LNK_ACTIVE && - conn->lgr->lnk[i].state != SMC_LNK_ACTIVATING) + if (!smc_link_usable(&conn->lgr->lnk[i])) continue; smc_ib_sync_sg_for_cpu(&conn->lgr->lnk[i], conn->rmb_desc, DMA_FROM_DEVICE); @@ -1328,8 +1331,7 @@ void smc_rmb_sync_sg_for_device(struct smc_connection *conn) if (!conn->lgr || conn->lgr->is_smcd) return; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (conn->lgr->lnk[i].state != SMC_LNK_ACTIVE && - conn->lgr->lnk[i].state != SMC_LNK_ACTIVATING) + if (!smc_link_usable(&conn->lgr->lnk[i])) continue; smc_ib_sync_sg_for_device(&conn->lgr->lnk[i], conn->rmb_desc, DMA_FROM_DEVICE); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index f68ba187ecf8..2b1960c8c8ce 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -32,6 +32,7 @@ enum smc_lgr_role { /* possible roles of a link group */ }; enum smc_link_state { /* possible states of a link */ + SMC_LNK_UNUSED, /* link is unused */ SMC_LNK_INACTIVE, /* link is inactive */ SMC_LNK_ACTIVATING, /* link is being activated */ SMC_LNK_ACTIVE, /* link is active */ @@ -295,6 +296,14 @@ static inline struct smc_connection *smc_lgr_find_conn( return res; } +/* returns true if the specified link is usable */ +static inline bool smc_link_usable(struct smc_link *lnk) +{ + if (lnk->state == SMC_LNK_UNUSED || lnk->state == SMC_LNK_INACTIVE) + return false; + return true; +} + struct smc_sock; struct smc_clc_msg_accept_confirm; struct 
smc_clc_msg_local; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 903ae068da3a..c267f5006faa 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -372,7 +372,7 @@ static void smc_llc_send_message_work(struct work_struct *work) struct smc_wr_buf *wr_buf; int rc; - if (llcwrk->link->state == SMC_LNK_INACTIVE) + if (!smc_link_usable(llcwrk->link)) goto out; rc = smc_llc_add_pending_send(llcwrk->link, &wr_buf, &pend); if (rc) @@ -562,7 +562,7 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) return; /* short message */ if (llc->raw.hdr.length != sizeof(*llc)) return; /* invalid message */ - if (link->state == SMC_LNK_INACTIVE) + if (!smc_link_usable(link)) return; /* link not active, drop msg */ switch (llc->raw.hdr.common.type) { diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 461c0c3ef76e..08171131110c 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -35,6 +35,17 @@ enum smc_llc_msg_type { SMC_LLC_DELETE_RKEY = 0x09, }; +/* returns a usable link of the link group, or NULL */ +static inline struct smc_link *smc_llc_usable_link(struct smc_link_group *lgr) +{ + int i; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + if (smc_link_usable(&lgr->lnk[i])) + return &lgr->lnk[i]; + return NULL; +} + /* transmit */ int smc_llc_send_confirm_link(struct smc_link *lnk, enum smc_llc_reqresp reqresp); diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 337ee52ad3d3..93223628c002 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -207,7 +207,7 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, } else { rc = wait_event_interruptible_timeout( link->wr_tx_wait, - link->state == SMC_LNK_INACTIVE || + !smc_link_usable(link) || lgr->terminating || (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), SMC_WR_TX_WAIT_FREE_SLOT_TIME); -- cgit v1.2.3 From 1020e1ef53ceef715f2bc144eebbfe01e88effcf Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:44 +0200 Subject: net/smc: move testlink work to system work queue The testlink work waits for a response to the testlink request and blocks the single-threaded llc_wq. This type of work does not have to be serialized and can be moved to the system work queue. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S.
Miller --- net/smc/smc_llc.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index c267f5006faa..69cc0d65b437 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -613,14 +613,15 @@ static void smc_llc_testlink_work(struct work_struct *work) /* receive TEST LINK response over RoCE fabric */ rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp, SMC_LLC_WAIT_TIME); + if (link->state != SMC_LNK_ACTIVE) + return; /* link state changed */ if (rc <= 0) { smc_lgr_terminate_sched(smc_get_lgr(link)); return; } next_interval = link->llc_testlink_time; out: - queue_delayed_work(link->llc_wq, &link->llc_testlink_wrk, - next_interval); + schedule_delayed_work(&link->llc_testlink_wrk, next_interval); } int smc_llc_link_init(struct smc_link *link) @@ -648,8 +649,8 @@ void smc_llc_link_active(struct smc_link *link, int testlink_time) link->state = SMC_LNK_ACTIVE; if (testlink_time) { link->llc_testlink_time = testlink_time * HZ; - queue_delayed_work(link->llc_wq, &link->llc_testlink_wrk, - link->llc_testlink_time); + schedule_delayed_work(&link->llc_testlink_wrk, + link->llc_testlink_time); } } @@ -665,7 +666,7 @@ void smc_llc_link_inactive(struct smc_link *link) if (link->state == SMC_LNK_INACTIVE) return; link->state = SMC_LNK_INACTIVE; - cancel_delayed_work(&link->llc_testlink_wrk); + cancel_delayed_work_sync(&link->llc_testlink_wrk); smc_wr_wakeup_reg_wait(link); smc_wr_wakeup_tx_wait(link); } -- cgit v1.2.3 From 2140ac26f8f501d3cc8f1575e6419f1a50779496 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:45 +0200 Subject: net/smc: simplify link deactivation Cancel the testlink worker during link clear processing and remove the extra function smc_llc_link_inactive(). Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_core.c | 8 ++------ net/smc/smc_llc.c | 15 ++++----------- net/smc/smc_llc.h | 1 - 3 files changed, 6 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 57890cbd4e8a..78ccfbf6e4af 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -261,7 +261,7 @@ static void smc_lgr_free_work(struct work_struct *work) struct smc_link *lnk = &lgr->lnk[i]; if (smc_link_usable(lnk)) - smc_llc_link_inactive(lnk); + lnk->state = SMC_LNK_INACTIVE; } } smc_lgr_free(lgr); @@ -692,7 +692,7 @@ static void smc_lgr_cleanup(struct smc_link_group *lgr) struct smc_link *lnk = &lgr->lnk[i]; if (smc_link_usable(lnk)) - smc_llc_link_inactive(lnk); + lnk->state = SMC_LNK_INACTIVE; } } } @@ -706,16 +706,12 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) struct smc_connection *conn; struct smc_sock *smc; struct rb_node *node; - int i; if (lgr->terminating) return; /* lgr already terminating */ if (!soft) cancel_delayed_work_sync(&lgr->free_work); lgr->terminating = 1; - if (!lgr->is_smcd) - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) - smc_llc_link_inactive(&lgr->lnk[i]); /* kill remaining link group connections */ read_lock_bh(&lgr->conns_lock); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 69cc0d65b437..2f03131c85fd 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -660,22 +660,15 @@ void smc_llc_link_deleting(struct smc_link *link) smc_wr_wakeup_tx_wait(link); } -/* called in tasklet context */ -void smc_llc_link_inactive(struct smc_link *link) -{ - if (link->state == SMC_LNK_INACTIVE) - return; - link->state = SMC_LNK_INACTIVE; - cancel_delayed_work_sync(&link->llc_testlink_wrk); - smc_wr_wakeup_reg_wait(link); - smc_wr_wakeup_tx_wait(link); -} - /* called in worker context */ void smc_llc_link_clear(struct smc_link *link) { flush_workqueue(link->llc_wq); destroy_workqueue(link->llc_wq); + complete(&link->llc_testlink_resp); + cancel_delayed_work_sync(&link->llc_testlink_wrk); + smc_wr_wakeup_reg_wait(link); + smc_wr_wakeup_tx_wait(link); } /* register a new rtoken at the remote peer */ diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 08171131110c..c2c9d48d079f 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -56,7 +56,6 @@ int smc_llc_send_delete_link(struct smc_link *link, int smc_llc_link_init(struct smc_link *link); void smc_llc_link_active(struct smc_link *link, int testlink_time); void smc_llc_link_deleting(struct smc_link *link); -void smc_llc_link_inactive(struct smc_link *link); void smc_llc_link_clear(struct smc_link *link); int smc_llc_do_confirm_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); int smc_llc_do_delete_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); -- cgit v1.2.3 From 6c8968c421e0e6bea8a78ee4fdd043d850cd5b26 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:46 +0200 Subject: net/smc: use worker to process incoming llc messages Incoming llc messages are processed in irq tasklet context, and a worker is used to send outgoing messages. The worker is needed because getting a send buffer could result in a wait for a free buffer. To make sure all incoming llc messages are processed in a serialized way, introduce an event queue and create a new queue entry for each incoming message. A new worker processes the event queue entries in order. Remove the separate worker that was used to send outgoing llc messages, because the messages are now already sent from worker context. With this event queue in place, the serialized llc_wq work queue is obsolete; remove it.
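[Editor's note] As background for this pattern, here is a minimal user-space sketch of the queue-and-worker scheme the patch introduces: producers copy each incoming message into a freshly allocated queue entry under a lock, and a single worker drains the entries in order, so message handling is serialized without a dedicated ordered work queue. All names below are invented for illustration, and pthread primitives stand in for the kernel's spinlock, GFP_ATOMIC allocation and schedule_work(); the real code in the diff uses struct smc_llc_qentry and lgr->llc_event_q instead.

/* illustrative sketch only -- not kernel code */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct qentry {
	struct qentry *next;
	char msg[44];			/* fixed-size copy, like union smc_llc_msg */
};

static struct qentry *q_head, *q_tail;
static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t q_cond = PTHREAD_COND_INITIALIZER;

static void handle_msg(const char *msg)
{
	printf("handling: %s\n", msg);	/* runs strictly in queue order */
}

/* producer side, cf. smc_llc_rx_handler(): copy the message, append it,
 * and kick the worker
 */
static int event_enqueue(const char *msg)
{
	struct qentry *e = calloc(1, sizeof(*e));

	if (!e)
		return -1;		/* caller drops the message */
	strncpy(e->msg, msg, sizeof(e->msg) - 1);
	pthread_mutex_lock(&q_lock);
	if (q_tail)
		q_tail->next = e;
	else
		q_head = e;
	q_tail = e;
	pthread_cond_signal(&q_cond);	/* kernel: schedule_work() */
	pthread_mutex_unlock(&q_lock);
	return 0;
}

/* single worker, cf. smc_llc_event_work(): dequeue one entry at a time and
 * handle it with the lock dropped, so producers are never blocked by a
 * long-running handler
 */
static void *event_worker(void *unused)
{
	(void)unused;
	for (;;) {
		struct qentry *e;

		pthread_mutex_lock(&q_lock);
		while (!q_head)
			pthread_cond_wait(&q_cond, &q_lock);
		e = q_head;
		q_head = e->next;
		if (!q_head)
			q_tail = NULL;
		pthread_mutex_unlock(&q_lock);
		handle_msg(e->msg);
		free(e);
	}
	return NULL;
}

A caller would start exactly one event_worker() thread per link group and call event_enqueue() from any receive context; because there is only one consumer, handlers never run concurrently and need no further locking, which is the property the kernel patch relies on.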
Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 4 +- net/smc/smc_core.h | 7 ++- net/smc/smc_llc.c | 142 ++++++++++++++++++++++++++++++++--------------------- net/smc/smc_llc.h | 1 + 4 files changed, 96 insertions(+), 58 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 78ccfbf6e4af..a1463da14614 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -412,7 +412,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, SMC_SYSTEMID_LEN); - + INIT_LIST_HEAD(&lgr->llc_event_q); + spin_lock_init(&lgr->llc_event_q_lock); link_idx = SMC_SINGLE_LINK; lnk = &lgr->lnk[link_idx]; rc = smcr_link_init(lgr, lnk, link_idx, ini); @@ -613,6 +614,7 @@ static void smc_lgr_free(struct smc_link_group *lgr) if (lgr->lnk[i].state != SMC_LNK_UNUSED) smcr_link_clear(&lgr->lnk[i]); } + smc_llc_event_flush(lgr); if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 2b1960c8c8ce..6548e9a06f73 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -120,7 +120,6 @@ struct smc_link { struct smc_link_group *lgr; /* parent link group */ enum smc_link_state state; /* state of link */ - struct workqueue_struct *llc_wq; /* single thread work queue */ struct completion llc_confirm; /* wait for rx of conf link */ struct completion llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */ int llc_confirm_rc; /* rc from confirm link msg */ @@ -233,6 +232,12 @@ struct smc_link_group { DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX); /* used rtoken elements */ u8 next_link_id; + struct list_head llc_event_q; + /* queue for llc events */ + spinlock_t llc_event_q_lock; + /* protects llc_event_q */ + struct work_struct llc_event_work; + /* llc event worker */ }; struct { /* SMC-D */ u64 peer_gid; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 2f03131c85fd..be74876a36ae 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -134,6 +134,12 @@ union smc_llc_msg { #define SMC_LLC_FLAG_RESP 0x80 +struct smc_llc_qentry { + struct list_head list; + struct smc_link *link; + union smc_llc_msg msg; +}; + /********************************** send *************************************/ struct smc_llc_tx_pend { @@ -356,46 +362,20 @@ static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16]) return rc; } -struct smc_llc_send_work { - struct work_struct work; - struct smc_link *link; - int llclen; - union smc_llc_msg llcbuf; -}; - -/* worker that sends a prepared message */ -static void smc_llc_send_message_work(struct work_struct *work) +/* schedule an llc send on link, may wait for buffers */ +static int smc_llc_send_message(struct smc_link *link, void *llcbuf) { - struct smc_llc_send_work *llcwrk = container_of(work, - struct smc_llc_send_work, work); struct smc_wr_tx_pend_priv *pend; struct smc_wr_buf *wr_buf; int rc; - if (!smc_link_usable(llcwrk->link)) - goto out; - rc = smc_llc_add_pending_send(llcwrk->link, &wr_buf, &pend); + if (!smc_link_usable(link)) + return -ENOLINK; + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - goto out; - memcpy(wr_buf, &llcwrk->llcbuf, llcwrk->llclen); - smc_wr_tx_send(llcwrk->link, pend); -out: - kfree(llcwrk); -} - -/* copy llcbuf and schedule an llc send on link */ -static int smc_llc_send_message(struct smc_link *link, void *llcbuf, int llclen) -{ - struct 
smc_llc_send_work *wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC); - - if (!wrk) - return -ENOMEM; - INIT_WORK(&wrk->work, smc_llc_send_message_work); - wrk->link = link; - wrk->llclen = llclen; - memcpy(&wrk->llcbuf, llcbuf, llclen); - queue_work(link->llc_wq, &wrk->work); - return 0; + return rc; + memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg)); + return smc_wr_tx_send(link, pend); } /********************************* receive ***********************************/ @@ -452,7 +432,7 @@ static void smc_llc_rx_add_link(struct smc_link *link, link->smcibdev->mac[link->ibport - 1], link->gid, SMC_LLC_RESP); } - smc_llc_send_message(link, llc, sizeof(*llc)); + smc_llc_send_message(link, llc); } } @@ -474,7 +454,7 @@ static void smc_llc_rx_delete_link(struct smc_link *link, /* server requests to delete this link, send response */ smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true); } - smc_llc_send_message(link, llc, sizeof(*llc)); + smc_llc_send_message(link, llc); smc_lgr_terminate_sched(lgr); } } @@ -487,7 +467,7 @@ static void smc_llc_rx_test_link(struct smc_link *link, complete(&link->llc_testlink_resp); } else { llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc, sizeof(*llc)); + smc_llc_send_message(link, llc); } } @@ -510,7 +490,7 @@ static void smc_llc_rx_confirm_rkey(struct smc_link *link, llc->hd.flags |= SMC_LLC_FLAG_RESP; if (rc < 0) llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; - smc_llc_send_message(link, llc, sizeof(*llc)); + smc_llc_send_message(link, llc); } } @@ -522,7 +502,7 @@ static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link, } else { /* ignore rtokens for other links, we have only one link */ llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc, sizeof(*llc)); + smc_llc_send_message(link, llc); } } @@ -549,21 +529,30 @@ static void smc_llc_rx_delete_rkey(struct smc_link *link, } llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc, sizeof(*llc)); + smc_llc_send_message(link, llc); } } -static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) +/* flush the llc event queue */ +void smc_llc_event_flush(struct smc_link_group *lgr) { - struct smc_link *link = (struct smc_link *)wc->qp->qp_context; - union smc_llc_msg *llc = buf; + struct smc_llc_qentry *qentry, *q; + + spin_lock_bh(&lgr->llc_event_q_lock); + list_for_each_entry_safe(qentry, q, &lgr->llc_event_q, list) { + list_del_init(&qentry->list); + kfree(qentry); + } + spin_unlock_bh(&lgr->llc_event_q_lock); +} + +static void smc_llc_event_handler(struct smc_llc_qentry *qentry) +{ + union smc_llc_msg *llc = &qentry->msg; + struct smc_link *link = qentry->link; - if (wc->byte_len < sizeof(*llc)) - return; /* short message */ - if (llc->raw.hdr.length != sizeof(*llc)) - return; /* invalid message */ if (!smc_link_usable(link)) - return; /* link not active, drop msg */ + goto out; switch (llc->raw.hdr.common.type) { case SMC_LLC_TEST_LINK: @@ -588,6 +577,54 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) smc_llc_rx_delete_rkey(link, &llc->delete_rkey); break; } +out: + kfree(qentry); +} + +/* worker to process llc messages on the event queue */ +static void smc_llc_event_work(struct work_struct *work) +{ + struct smc_link_group *lgr = container_of(work, struct smc_link_group, + llc_event_work); + struct smc_llc_qentry *qentry; + +again: + spin_lock_bh(&lgr->llc_event_q_lock); + if (!list_empty(&lgr->llc_event_q)) { + qentry = list_first_entry(&lgr->llc_event_q, + struct smc_llc_qentry, list); + list_del_init(&qentry->list); + 
spin_unlock_bh(&lgr->llc_event_q_lock); + smc_llc_event_handler(qentry); + goto again; + } + spin_unlock_bh(&lgr->llc_event_q_lock); +} + +/* copy received msg and add it to the event queue */ +static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) +{ + struct smc_link *link = (struct smc_link *)wc->qp->qp_context; + struct smc_link_group *lgr = link->lgr; + struct smc_llc_qentry *qentry; + union smc_llc_msg *llc = buf; + unsigned long flags; + + if (wc->byte_len < sizeof(*llc)) + return; /* short message */ + if (llc->raw.hdr.length != sizeof(*llc)) + return; /* invalid message */ + + qentry = kmalloc(sizeof(*qentry), GFP_ATOMIC); + if (!qentry) + return; + qentry->link = link; + INIT_LIST_HEAD(&qentry->list); + memcpy(&qentry->msg, llc, sizeof(union smc_llc_msg)); + spin_lock_irqsave(&lgr->llc_event_q_lock, flags); + list_add_tail(&qentry->list, &lgr->llc_event_q); + spin_unlock_irqrestore(&lgr->llc_event_q_lock, flags); + schedule_work(&link->lgr->llc_event_work); } /***************************** worker, utils *********************************/ @@ -626,12 +663,6 @@ out: int smc_llc_link_init(struct smc_link *link) { - struct smc_link_group *lgr = smc_get_lgr(link); - link->llc_wq = alloc_ordered_workqueue("llc_wq-%x:%x)", WQ_MEM_RECLAIM, - *((u32 *)lgr->id), - link->link_id); - if (!link->llc_wq) - return -ENOMEM; init_completion(&link->llc_confirm); init_completion(&link->llc_confirm_resp); init_completion(&link->llc_add); @@ -640,6 +671,7 @@ int smc_llc_link_init(struct smc_link *link) init_completion(&link->llc_delete_rkey); mutex_init(&link->llc_delete_rkey_mutex); init_completion(&link->llc_testlink_resp); + INIT_WORK(&link->lgr->llc_event_work, smc_llc_event_work); INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); return 0; } @@ -663,8 +695,6 @@ void smc_llc_link_deleting(struct smc_link *link) /* called in worker context */ void smc_llc_link_clear(struct smc_link *link) { - flush_workqueue(link->llc_wq); - destroy_workqueue(link->llc_wq); complete(&link->llc_testlink_resp); cancel_delayed_work_sync(&link->llc_testlink_wrk); smc_wr_wakeup_reg_wait(link); diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index c2c9d48d079f..9de83495ad14 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -61,6 +61,7 @@ int smc_llc_do_confirm_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); int smc_llc_do_delete_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); +void smc_llc_event_flush(struct smc_link_group *lgr); int smc_llc_init(void) __init; #endif /* SMC_LLC_H */ -- cgit v1.2.3 From ef79d439cd124d9fb7258bb35d44c71aec11b829 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:47 +0200 Subject: net/smc: process llc responses in tasklet context When llc responses are received, the possible waiters for these responses need to be notified. This can be done in tasklet context, without using a work item on the llc work queue. Move all code that handles llc responses into smc_llc_rx_response(). Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S.
Miller --- net/smc/smc_core.h | 8 +- net/smc/smc_llc.c | 216 +++++++++++++++++++++++++++-------------------------- 2 files changed, 116 insertions(+), 108 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 6548e9a06f73..d785656b3489 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -129,10 +129,10 @@ struct smc_link { struct delayed_work llc_testlink_wrk; /* testlink worker */ struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ - struct completion llc_confirm_rkey; /* wait 4 rx of cnf rkey */ - int llc_confirm_rkey_rc; /* rc from cnf rkey msg */ - struct completion llc_delete_rkey; /* wait 4 rx of del rkey */ - int llc_delete_rkey_rc; /* rc from del rkey msg */ + struct completion llc_confirm_rkey_resp; /* w4 rx of cnf rkey */ + int llc_confirm_rkey_resp_rc; /* rc from cnf rkey */ + struct completion llc_delete_rkey_resp; /* w4 rx of del rkey */ + int llc_delete_rkey_resp_rc; /* rc from del rkey */ struct mutex llc_delete_rkey_mutex; /* serialize usage */ }; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index be74876a36ae..265889c8b03b 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -384,27 +384,17 @@ static void smc_llc_rx_confirm_link(struct smc_link *link, struct smc_llc_msg_confirm_link *llc) { struct smc_link_group *lgr = smc_get_lgr(link); - int conf_rc; + int conf_rc = 0; /* RMBE eyecatchers are not supported */ - if (llc->hd.flags & SMC_LLC_FLAG_NO_RMBE_EYEC) - conf_rc = 0; - else + if (!(llc->hd.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)) conf_rc = ENOTSUPP; - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (lgr->role == SMC_SERV && - link->state == SMC_LNK_ACTIVATING) { - link->llc_confirm_resp_rc = conf_rc; - complete(&link->llc_confirm_resp); - } - } else { - if (lgr->role == SMC_CLNT && - link->state == SMC_LNK_ACTIVATING) { - link->llc_confirm_rc = conf_rc; - link->link_id = llc->link_num; - complete(&link->llc_confirm); - } + if (lgr->role == SMC_CLNT && + link->state == SMC_LNK_ACTIVATING) { + link->llc_confirm_rc = conf_rc; + link->link_id = llc->link_num; + complete(&link->llc_confirm); } } @@ -413,27 +403,22 @@ static void smc_llc_rx_add_link(struct smc_link *link, { struct smc_link_group *lgr = smc_get_lgr(link); - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (link->state == SMC_LNK_ACTIVATING) - complete(&link->llc_add_resp); - } else { - if (link->state == SMC_LNK_ACTIVATING) { - complete(&link->llc_add); - return; - } + if (link->state == SMC_LNK_ACTIVATING) { + complete(&link->llc_add); + return; + } - if (lgr->role == SMC_SERV) { - smc_llc_prep_add_link(llc, link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_REQ); + if (lgr->role == SMC_SERV) { + smc_llc_prep_add_link(llc, link, + link->smcibdev->mac[link->ibport - 1], + link->gid, SMC_LLC_REQ); - } else { - smc_llc_prep_add_link(llc, link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_RESP); - } - smc_llc_send_message(link, llc); + } else { + smc_llc_prep_add_link(llc, link, + link->smcibdev->mac[link->ibport - 1], + link->gid, SMC_LLC_RESP); } + smc_llc_send_message(link, llc); } static void smc_llc_rx_delete_link(struct smc_link *link, @@ -441,34 +426,24 @@ static void smc_llc_rx_delete_link(struct smc_link *link, { struct smc_link_group *lgr = smc_get_lgr(link); - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (lgr->role == SMC_SERV) - smc_lgr_schedule_free_work_fast(lgr); + smc_lgr_forget(lgr); + smc_llc_link_deleting(link); + if (lgr->role == SMC_SERV) { + /* 
client asks to delete this link, send request */ + smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ, true); } else { - smc_lgr_forget(lgr); - smc_llc_link_deleting(link); - if (lgr->role == SMC_SERV) { - /* client asks to delete this link, send request */ - smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ, true); - } else { - /* server requests to delete this link, send response */ - smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true); - } - smc_llc_send_message(link, llc); - smc_lgr_terminate_sched(lgr); + /* server requests to delete this link, send response */ + smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true); } + smc_llc_send_message(link, llc); + smc_lgr_terminate_sched(lgr); } static void smc_llc_rx_test_link(struct smc_link *link, struct smc_llc_msg_test_link *llc) { - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (link->state == SMC_LNK_ACTIVE) - complete(&link->llc_testlink_resp); - } else { - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc); - } + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, llc); } static void smc_llc_rx_confirm_rkey(struct smc_link *link, @@ -476,34 +451,24 @@ static void smc_llc_rx_confirm_rkey(struct smc_link *link, { int rc; - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - link->llc_confirm_rkey_rc = llc->hd.flags & - SMC_LLC_FLAG_RKEY_NEG; - complete(&link->llc_confirm_rkey); - } else { - rc = smc_rtoken_add(link, - llc->rtoken[0].rmb_vaddr, - llc->rtoken[0].rmb_key); + rc = smc_rtoken_add(link, + llc->rtoken[0].rmb_vaddr, + llc->rtoken[0].rmb_key); - /* ignore rtokens for other links, we have only one link */ + /* ignore rtokens for other links, we have only one link */ - llc->hd.flags |= SMC_LLC_FLAG_RESP; - if (rc < 0) - llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; - smc_llc_send_message(link, llc); - } + llc->hd.flags |= SMC_LLC_FLAG_RESP; + if (rc < 0) + llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; + smc_llc_send_message(link, llc); } static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link, struct smc_llc_msg_confirm_rkey_cont *llc) { - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - /* unused as long as we don't send this type of msg */ - } else { - /* ignore rtokens for other links, we have only one link */ - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc); - } + /* ignore rtokens for other links, we have only one link */ + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, llc); } static void smc_llc_rx_delete_rkey(struct smc_link *link, @@ -512,25 +477,19 @@ static void smc_llc_rx_delete_rkey(struct smc_link *link, u8 err_mask = 0; int i, max; - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - link->llc_delete_rkey_rc = llc->hd.flags & - SMC_LLC_FLAG_RKEY_NEG; - complete(&link->llc_delete_rkey); - } else { - max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); - for (i = 0; i < max; i++) { - if (smc_rtoken_delete(link, llc->rkey[i])) - err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i); - } - - if (err_mask) { - llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; - llc->err_mask = err_mask; - } + max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); + for (i = 0; i < max; i++) { + if (smc_rtoken_delete(link, llc->rkey[i])) + err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i); + } - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc); + if (err_mask) { + llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; + llc->err_mask = err_mask; } + + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, llc); } /* flush the llc event queue */ @@ -601,6 +560,49 @@ again: 
spin_unlock_bh(&lgr->llc_event_q_lock); } +/* process llc responses in tasklet context */ +static void smc_llc_rx_response(struct smc_link *link, union smc_llc_msg *llc) +{ + int rc = 0; + + switch (llc->raw.hdr.common.type) { + case SMC_LLC_TEST_LINK: + if (link->state == SMC_LNK_ACTIVE) + complete(&link->llc_testlink_resp); + break; + case SMC_LLC_CONFIRM_LINK: + if (!(llc->raw.hdr.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)) + rc = ENOTSUPP; + if (link->lgr->role == SMC_SERV && + link->state == SMC_LNK_ACTIVATING) { + link->llc_confirm_resp_rc = rc; + complete(&link->llc_confirm_resp); + } + break; + case SMC_LLC_ADD_LINK: + if (link->state == SMC_LNK_ACTIVATING) + complete(&link->llc_add_resp); + break; + case SMC_LLC_DELETE_LINK: + if (link->lgr->role == SMC_SERV) + smc_lgr_schedule_free_work_fast(link->lgr); + break; + case SMC_LLC_CONFIRM_RKEY: + link->llc_confirm_rkey_resp_rc = llc->raw.hdr.flags & + SMC_LLC_FLAG_RKEY_NEG; + complete(&link->llc_confirm_rkey_resp); + break; + case SMC_LLC_CONFIRM_RKEY_CONT: + /* unused as long as we don't send this type of msg */ + break; + case SMC_LLC_DELETE_RKEY: + link->llc_delete_rkey_resp_rc = llc->raw.hdr.flags & + SMC_LLC_FLAG_RKEY_NEG; + complete(&link->llc_delete_rkey_resp); + break; + } +} + /* copy received msg and add it to the event queue */ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) { @@ -615,6 +617,12 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) if (llc->raw.hdr.length != sizeof(*llc)) return; /* invalid message */ + /* process responses immediately */ + if (llc->raw.hdr.flags & SMC_LLC_FLAG_RESP) { + smc_llc_rx_response(link, llc); + return; + } + qentry = kmalloc(sizeof(*qentry), GFP_ATOMIC); if (!qentry) return; @@ -667,8 +675,8 @@ int smc_llc_link_init(struct smc_link *link) init_completion(&link->llc_confirm_resp); init_completion(&link->llc_add); init_completion(&link->llc_add_resp); - init_completion(&link->llc_confirm_rkey); - init_completion(&link->llc_delete_rkey); + init_completion(&link->llc_confirm_rkey_resp); + init_completion(&link->llc_delete_rkey_resp); mutex_init(&link->llc_delete_rkey_mutex); init_completion(&link->llc_testlink_resp); INIT_WORK(&link->lgr->llc_event_work, smc_llc_event_work); @@ -708,14 +716,14 @@ int smc_llc_do_confirm_rkey(struct smc_link *link, int rc; /* protected by mutex smc_create_lgr_pending */ - reinit_completion(&link->llc_confirm_rkey); + reinit_completion(&link->llc_confirm_rkey_resp); rc = smc_llc_send_confirm_rkey(link, rmb_desc); if (rc) return rc; /* receive CONFIRM RKEY response from server over RoCE fabric */ - rc = wait_for_completion_interruptible_timeout(&link->llc_confirm_rkey, - SMC_LLC_WAIT_TIME); - if (rc <= 0 || link->llc_confirm_rkey_rc) + rc = wait_for_completion_interruptible_timeout( + &link->llc_confirm_rkey_resp, SMC_LLC_WAIT_TIME); + if (rc <= 0 || link->llc_confirm_rkey_resp_rc) return -EFAULT; return 0; } @@ -729,14 +737,14 @@ int smc_llc_do_delete_rkey(struct smc_link *link, mutex_lock(&link->llc_delete_rkey_mutex); if (link->state != SMC_LNK_ACTIVE) goto out; - reinit_completion(&link->llc_delete_rkey); + reinit_completion(&link->llc_delete_rkey_resp); rc = smc_llc_send_delete_rkey(link, rmb_desc); if (rc) goto out; /* receive DELETE RKEY response from server over RoCE fabric */ - rc = wait_for_completion_interruptible_timeout(&link->llc_delete_rkey, - SMC_LLC_WAIT_TIME); - if (rc <= 0 || link->llc_delete_rkey_rc) + rc = wait_for_completion_interruptible_timeout( + &link->llc_delete_rkey_resp, SMC_LLC_WAIT_TIME); + if (rc <= 0 || 
link->llc_delete_rkey_resp_rc) rc = -EFAULT; else rc = 0; -- cgit v1.2.3 From faca536008375bece23783e7382b5d0356c13ba5 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:48 +0200 Subject: net/smc: use mutex instead of rwlock_t to protect buffers The locks for sndbufs and rmbs are never used from atomic context. Using a mutex for these locks allows them to be nested with other mutexes. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 22 +++++++++++----------- net/smc/smc_core.h | 4 ++-- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index a1463da14614..8a43d2948493 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -385,8 +385,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->freefast = 0; lgr->freeing = 0; lgr->vlan_id = ini->vlan_id; - rwlock_init(&lgr->sndbufs_lock); - rwlock_init(&lgr->rmbs_lock); + mutex_init(&lgr->sndbufs_lock); + mutex_init(&lgr->rmbs_lock); rwlock_init(&lgr->conns_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { INIT_LIST_HEAD(&lgr->sndbufs[i]); @@ -456,9 +456,9 @@ static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc, } if (rmb_desc->is_reg_err) { /* buf registration failed, reuse not possible */ - write_lock_bh(&lgr->rmbs_lock); + mutex_lock(&lgr->rmbs_lock); list_del(&rmb_desc->list); - write_unlock_bh(&lgr->rmbs_lock); + mutex_unlock(&lgr->rmbs_lock); smc_buf_free(lgr, true, rmb_desc); } else { @@ -1059,19 +1059,19 @@ int smc_uncompress_bufsize(u8 compressed) * buffer size; if not available, return NULL */ static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, - rwlock_t *lock, + struct mutex *lock, struct list_head *buf_list) { struct smc_buf_desc *buf_slot; - read_lock_bh(lock); + mutex_lock(lock); list_for_each_entry(buf_slot, buf_list, list) { if (cmpxchg(&buf_slot->used, 0, 1) == 0) { - read_unlock_bh(lock); + mutex_unlock(lock); return buf_slot; } } - read_unlock_bh(lock); + mutex_unlock(lock); return NULL; } @@ -1220,8 +1220,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) struct smc_link_group *lgr = conn->lgr; struct list_head *buf_list; int bufsize, bufsize_short; + struct mutex *lock; /* lock buffer list */ int sk_buf_size; - rwlock_t *lock; if (is_rmb) /* use socket recv buffer size (w/o overhead) as start value */ @@ -1262,9 +1262,9 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) continue; buf_desc->used = 1; - write_lock_bh(lock); + mutex_lock(lock); list_add(&buf_desc->list, buf_list); - write_unlock_bh(lock); + mutex_unlock(lock); break; /* found */ } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index d785656b3489..379ced490c49 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -205,9 +205,9 @@ struct smc_link_group { unsigned short vlan_id; /* vlan id of link group */ struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */ - rwlock_t sndbufs_lock; /* protects tx buffers */ + struct mutex sndbufs_lock; /* protects tx buffers */ struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */ - rwlock_t rmbs_lock; /* protects rx buffers */ + struct mutex rmbs_lock; /* protects rx buffers */ u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an lgr */ -- cgit v1.2.3 From 00a049cfde95931c6832edad19d9a4be441cacf5 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:49 +0200 Subject: net/smc: move
llc layer related init and clear into smc_llc.c Introduce smc_llc_lgr_init() and smc_llc_lgr_clear() to implement all llc layer specific initialization and cleanup in module smc_llc.c. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 6 ++---- net/smc/smc_core.c | 6 +++--- net/smc/smc_core.h | 2 ++ net/smc/smc_llc.c | 26 +++++++++++++++++++++----- net/smc/smc_llc.h | 5 +++-- 5 files changed, 31 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e39f6aedd3bd..e859e3f420d9 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -381,7 +381,6 @@ static int smcr_lgr_reg_rmbs(struct smc_link_group *lgr, static int smcr_clnt_conf_first_link(struct smc_sock *smc) { - struct net *net = sock_net(smc->clcsock->sk); struct smc_link *link = smc->conn.lnk; int rest; int rc; @@ -433,7 +432,7 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) if (rc < 0) return SMC_CLC_DECL_TIMEOUT_AL; - smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time); + smc_llc_link_active(link); return 0; } @@ -1019,7 +1018,6 @@ void smc_close_non_accepted(struct sock *sk) static int smcr_serv_conf_first_link(struct smc_sock *smc) { - struct net *net = sock_net(smc->clcsock->sk); struct smc_link *link = smc->conn.lnk; int rest; int rc; @@ -1065,7 +1063,7 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc; } - smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time); + smc_llc_link_active(link); return 0; } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 8a43d2948493..db49f8cd5c95 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -412,8 +412,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->role = smc->listen_smc ? 
SMC_SERV : SMC_CLNT; memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, SMC_SYSTEMID_LEN); - INIT_LIST_HEAD(&lgr->llc_event_q); - spin_lock_init(&lgr->llc_event_q_lock); + smc_llc_lgr_init(lgr, smc); + link_idx = SMC_SINGLE_LINK; lnk = &lgr->lnk[link_idx]; rc = smcr_link_init(lgr, lnk, link_idx, ini); @@ -614,7 +614,7 @@ static void smc_lgr_free(struct smc_link_group *lgr) if (lgr->lnk[i].state != SMC_LNK_UNUSED) smcr_link_clear(&lgr->lnk[i]); } - smc_llc_event_flush(lgr); + smc_llc_lgr_clear(lgr); if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 379ced490c49..b5781511063d 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -238,6 +238,8 @@ struct smc_link_group { /* protects llc_event_q */ struct work_struct llc_event_work; /* llc event worker */ + int llc_testlink_time; + /* link keep alive time */ }; struct { /* SMC-D */ u64 peer_gid; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 265889c8b03b..e715dd6735ee 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -493,7 +493,7 @@ static void smc_llc_rx_delete_rkey(struct smc_link *link, } /* flush the llc event queue */ -void smc_llc_event_flush(struct smc_link_group *lgr) +static void smc_llc_event_flush(struct smc_link_group *lgr) { struct smc_llc_qentry *qentry, *q; @@ -669,6 +669,23 @@ out: schedule_delayed_work(&link->llc_testlink_wrk, next_interval); } +void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) +{ + struct net *net = sock_net(smc->clcsock->sk); + + INIT_WORK(&lgr->llc_event_work, smc_llc_event_work); + INIT_LIST_HEAD(&lgr->llc_event_q); + spin_lock_init(&lgr->llc_event_q_lock); + lgr->llc_testlink_time = net->ipv4.sysctl_tcp_keepalive_time; +} + +/* called after lgr was removed from lgr_list */ +void smc_llc_lgr_clear(struct smc_link_group *lgr) +{ + smc_llc_event_flush(lgr); + cancel_work_sync(&lgr->llc_event_work); +} + int smc_llc_link_init(struct smc_link *link) { init_completion(&link->llc_confirm); @@ -679,16 +696,15 @@ int smc_llc_link_init(struct smc_link *link) init_completion(&link->llc_delete_rkey_resp); mutex_init(&link->llc_delete_rkey_mutex); init_completion(&link->llc_testlink_resp); - INIT_WORK(&link->lgr->llc_event_work, smc_llc_event_work); INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); return 0; } -void smc_llc_link_active(struct smc_link *link, int testlink_time) +void smc_llc_link_active(struct smc_link *link) { link->state = SMC_LNK_ACTIVE; - if (testlink_time) { - link->llc_testlink_time = testlink_time * HZ; + if (link->lgr->llc_testlink_time) { + link->llc_testlink_time = link->lgr->llc_testlink_time * HZ; schedule_delayed_work(&link->llc_testlink_wrk, link->llc_testlink_time); } diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 9de83495ad14..66063f22166b 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -53,15 +53,16 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], enum smc_llc_reqresp reqresp); int smc_llc_send_delete_link(struct smc_link *link, enum smc_llc_reqresp reqresp, bool orderly); +void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc); +void smc_llc_lgr_clear(struct smc_link_group *lgr); int smc_llc_link_init(struct smc_link *link); -void smc_llc_link_active(struct smc_link *link, int testlink_time); +void smc_llc_link_active(struct smc_link *link); void smc_llc_link_deleting(struct smc_link *link); void smc_llc_link_clear(struct smc_link *link); int smc_llc_do_confirm_rkey(struct smc_link *link, 
struct smc_buf_desc *rmb_desc); int smc_llc_do_delete_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); -void smc_llc_event_flush(struct smc_link_group *lgr); int smc_llc_init(void) __init; #endif /* SMC_LLC_H */ -- cgit v1.2.3 From 64d85290d79c0677edb5a8ee2295b36c022fa5df Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 29 Apr 2020 20:11:52 +0200 Subject: bpf: Allow bpf_map_lookup_elem for SOCKMAP and SOCKHASH White-list map lookup for SOCKMAP/SOCKHASH from BPF. Lookup returns a pointer to a full socket and acquires a reference if necessary. To support it we need to extend the verifier to know that: (1) register storing the lookup result holds a pointer to socket, if lookup was done on SOCKMAP/SOCKHASH, and that (2) map lookup on SOCKMAP/SOCKHASH is a reference acquiring operation, which needs a corresponding reference release with bpf_sk_release. On sock_map side, lookup handlers exposed via bpf_map_ops now bump sk_refcnt if socket is reference counted. In turn, bpf_sk_select_reuseport, the only in-kernel user of SOCKMAP/SOCKHASH ops->map_lookup_elem, was updated to release the reference. Sockets fetched from a map can be used in the same way as ones returned by BPF socket lookup helpers, such as bpf_sk_lookup_tcp. In particular, they can be used with bpf_sk_assign to direct packets toward a socket on TC ingress path. Suggested-by: Lorenz Bauer Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200429181154.479310-2-jakub@cloudflare.com --- kernel/bpf/verifier.c | 45 +++++++++++++++++++++++++++++++++++---------- net/core/filter.c | 4 ++++ net/core/sock_map.c | 18 ++++++++++++++++-- 3 files changed, 55 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2b337e32aa94..70ad009577f8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -429,11 +429,30 @@ static bool is_release_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_sk_release; } -static bool is_acquire_function(enum bpf_func_id func_id) +static bool may_be_acquire_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || - func_id == BPF_FUNC_skc_lookup_tcp; + func_id == BPF_FUNC_skc_lookup_tcp || + func_id == BPF_FUNC_map_lookup_elem; +} + +static bool is_acquire_function(enum bpf_func_id func_id, + const struct bpf_map *map) +{ + enum bpf_map_type map_type = map ? 
map->map_type : BPF_MAP_TYPE_UNSPEC; + + if (func_id == BPF_FUNC_sk_lookup_tcp || + func_id == BPF_FUNC_sk_lookup_udp || + func_id == BPF_FUNC_skc_lookup_tcp) + return true; + + if (func_id == BPF_FUNC_map_lookup_elem && + (map_type == BPF_MAP_TYPE_SOCKMAP || + map_type == BPF_MAP_TYPE_SOCKHASH)) + return true; + + return false; } static bool is_ptr_cast_function(enum bpf_func_id func_id) @@ -3934,7 +3953,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_sock_map_update && func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_msg_redirect_map && - func_id != BPF_FUNC_sk_select_reuseport) + func_id != BPF_FUNC_sk_select_reuseport && + func_id != BPF_FUNC_map_lookup_elem) goto error; break; case BPF_MAP_TYPE_SOCKHASH: @@ -3942,7 +3962,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_sock_hash_update && func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_msg_redirect_hash && - func_id != BPF_FUNC_sk_select_reuseport) + func_id != BPF_FUNC_sk_select_reuseport && + func_id != BPF_FUNC_map_lookup_elem) goto error; break; case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: @@ -4112,7 +4133,7 @@ static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id) /* A reference acquiring function cannot acquire * another refcounted ptr. */ - if (is_acquire_function(func_id) && count) + if (may_be_acquire_function(func_id) && count) return false; /* We only support one arg being unreferenced at the moment, @@ -4623,7 +4644,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (is_ptr_cast_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; - } else if (is_acquire_function(func_id)) { + } else if (is_acquire_function(func_id, meta.map_ptr)) { int id = acquire_reference_state(env, insn_idx); if (id < 0) @@ -6532,12 +6553,16 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, if (is_null) { reg->type = SCALAR_VALUE; } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - if (reg->map_ptr->inner_map_meta) { + const struct bpf_map *map = reg->map_ptr; + + if (map->inner_map_meta) { reg->type = CONST_PTR_TO_MAP; - reg->map_ptr = reg->map_ptr->inner_map_meta; - } else if (reg->map_ptr->map_type == - BPF_MAP_TYPE_XSKMAP) { + reg->map_ptr = map->inner_map_meta; + } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; + } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || + map->map_type == BPF_MAP_TYPE_SOCKHASH) { + reg->type = PTR_TO_SOCKET; } else { reg->type = PTR_TO_MAP_VALUE; } diff --git a/net/core/filter.c b/net/core/filter.c index da3b7a72c37c..70b32723e6be 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8712,6 +8712,10 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, reuse = rcu_dereference(selected_sk->sk_reuseport_cb); if (!reuse) { + /* Lookup in sock_map can return TCP ESTABLISHED sockets. */ + if (sk_is_refcounted(selected_sk)) + sock_put(selected_sk); + /* reuseport_array has only sk with non NULL sk_reuseport_cb. * The only (!reuse) case here is - the sk has already been * unhashed (e.g. by close()), so treat it as -ENOENT. 
diff --git a/net/core/sock_map.c b/net/core/sock_map.c index b08dfae10f88..00a26cf2cfe9 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -343,7 +343,14 @@ static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) static void *sock_map_lookup(struct bpf_map *map, void *key) { - return __sock_map_lookup_elem(map, *(u32 *)key); + struct sock *sk; + + sk = __sock_map_lookup_elem(map, *(u32 *)key); + if (!sk || !sk_fullsock(sk)) + return NULL; + if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) + return NULL; + return sk; } static void *sock_map_lookup_sys(struct bpf_map *map, void *key) @@ -1051,7 +1058,14 @@ static void *sock_hash_lookup_sys(struct bpf_map *map, void *key) static void *sock_hash_lookup(struct bpf_map *map, void *key) { - return __sock_hash_lookup_elem(map, key); + struct sock *sk; + + sk = __sock_hash_lookup_elem(map, key); + if (!sk || !sk_fullsock(sk)) + return NULL; + if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) + return NULL; + return sk; } static void sock_hash_release_progs(struct bpf_map *map) -- cgit v1.2.3 From 99b2292ba21b47bbd97d33ab20892347ad2ac351 Mon Sep 17 00:00:00 2001 From: Yunjian Wang Date: Thu, 30 Apr 2020 18:16:16 +0800 Subject: net: caif: Use correct return type for ndo_start_xmit() The method ndo_start_xmit() returns a value of type netdev_tx_t. Fix the ndo function to use the correct type. Signed-off-by: Yunjian Wang Signed-off-by: David S. Miller --- net/caif/chnl_net.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index a56628962852..79b6a04d8eb6 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -211,7 +211,8 @@ static void chnl_flowctrl_cb(struct cflayer *layr, enum caif_ctrlcmd flow, } } -static int chnl_net_start_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t chnl_net_start_xmit(struct sk_buff *skb, + struct net_device *dev) { struct chnl_net *priv; struct cfpkt *pkt = NULL; -- cgit v1.2.3 From 555da9af827d95134656fa459c8f3ece04dd867a Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:38 +0200 Subject: net/smc: add event-based llc_flow framework The new framework allows starting specific types of LLC control flows, protects active flows and makes it possible to wait for flows to finish before starting a new flow. This mechanism is used for the LLC control layer to model flows like 'add link' or 'delete link' which need to send/receive several LLC messages and must not be interrupted by the wrong type of messages. 'Add link' or 'Delete link' messages arriving in the middle of a flow are delayed and processed when the current flow has finished. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller
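For illustration, the caller pattern the framework is designed for (a sketch; this shape is used by later patches in this series for the rkey flows):

	struct smc_llc_qentry *qentry;
	int rc;

	rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY);	/* waits until no conflicting flow is active */
	if (rc)
		return rc;
	/* ... send an LLC request message on a link of lgr ... */
	qentry = smc_llc_wait(lgr, send_link, SMC_LLC_WAIT_TIME,
			      SMC_LLC_CONFIRM_RKEY);	/* wait for the matching response */
	/* ... evaluate qentry, then release it ... */
	if (qentry)
		smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
	smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);	/* wakes waiters, runs delayed events */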
--- net/smc/smc_core.c | 2 + net/smc/smc_core.h | 24 ++++++++ net/smc/smc_llc.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++++++++ net/smc/smc_llc.h | 8 +++ 4 files changed, 199 insertions(+) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index db49f8cd5c95..4867ddcfe0c6 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -263,6 +263,7 @@ static void smc_lgr_free_work(struct work_struct *work) if (smc_link_usable(lnk)) lnk->state = SMC_LNK_INACTIVE; } + wake_up_interruptible_all(&lgr->llc_waiter); } smc_lgr_free(lgr); } @@ -696,6 +697,7 @@ static void smc_lgr_cleanup(struct smc_link_group *lgr) if (smc_link_usable(lnk)) lnk->state = SMC_LNK_INACTIVE; } + wake_up_interruptible_all(&lgr->llc_waiter); } } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index b5781511063d..70399217ad6f 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -197,6 +197,20 @@ struct smc_rtoken { /* address/key of remote RMB */ struct smcd_dev; +enum smc_llc_flowtype { + SMC_LLC_FLOW_NONE = 0, + SMC_LLC_FLOW_ADD_LINK = 2, + SMC_LLC_FLOW_DEL_LINK = 4, + SMC_LLC_FLOW_RKEY = 6, +}; + +struct smc_llc_qentry; + +struct smc_llc_flow { + enum smc_llc_flowtype type; + struct smc_llc_qentry *qentry; +}; + struct smc_link_group { struct list_head list; struct rb_root conns_all; /* connection tree */ @@ -238,6 +252,16 @@ struct smc_link_group { /* protects llc_event_q */ struct work_struct llc_event_work; /* llc event worker */ + wait_queue_head_t llc_waiter; + /* w4 next llc event */ + struct smc_llc_flow llc_flow_lcl; + /* llc local control field */ + struct smc_llc_flow llc_flow_rmt; + /* llc remote control field */ + struct smc_llc_qentry *delayed_event; + /* arrived when flow active */ + spinlock_t llc_flow_lock; + /* protects llc flow */ int llc_testlink_time; /* link keep alive time */ }; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index e715dd6735ee..647cf1a2dfa5 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -140,6 +140,154 @@ struct smc_llc_qentry { union smc_llc_msg msg; }; +struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow) +{ + struct smc_llc_qentry *qentry = flow->qentry; + + flow->qentry = NULL; + return qentry; +} + +void smc_llc_flow_qentry_del(struct smc_llc_flow *flow) +{ + struct smc_llc_qentry *qentry; + + if (flow->qentry) { + qentry = flow->qentry; + flow->qentry = NULL; + kfree(qentry); + } +} + +static inline void smc_llc_flow_qentry_set(struct smc_llc_flow *flow, + struct smc_llc_qentry *qentry) +{ + flow->qentry = qentry; +} + +/* try to start a new llc flow, initiated by an incoming llc msg */ +static bool smc_llc_flow_start(struct smc_llc_flow *flow, + struct smc_llc_qentry *qentry) +{ + struct smc_link_group *lgr = qentry->link->lgr; + + spin_lock_bh(&lgr->llc_flow_lock); + if (flow->type) { + /* a flow is already active */ + if ((qentry->msg.raw.hdr.common.type == SMC_LLC_ADD_LINK || + qentry->msg.raw.hdr.common.type == SMC_LLC_DELETE_LINK) && + !lgr->delayed_event) { + lgr->delayed_event = qentry; + } else { + /* forget this llc request */ + kfree(qentry); + } + spin_unlock_bh(&lgr->llc_flow_lock); + return false; + } + switch (qentry->msg.raw.hdr.common.type) { + case SMC_LLC_ADD_LINK: + flow->type = SMC_LLC_FLOW_ADD_LINK; + break; + case SMC_LLC_DELETE_LINK: + flow->type = SMC_LLC_FLOW_DEL_LINK; + break; + case SMC_LLC_CONFIRM_RKEY: + case SMC_LLC_DELETE_RKEY: + flow->type = SMC_LLC_FLOW_RKEY; + break; + default: + flow->type = SMC_LLC_FLOW_NONE; + } + if (qentry == lgr->delayed_event)
lgr->delayed_event = NULL; + spin_unlock_bh(&lgr->llc_flow_lock); + smc_llc_flow_qentry_set(flow, qentry); + return true; +} + +/* start a new local llc flow, wait till current flow finished */ +int smc_llc_flow_initiate(struct smc_link_group *lgr, + enum smc_llc_flowtype type) +{ + enum smc_llc_flowtype allowed_remote = SMC_LLC_FLOW_NONE; + int rc; + + /* all flows except confirm_rkey and delete_rkey are exclusive, + * confirm/delete rkey flows can run concurrently (local and remote) + */ + if (type == SMC_LLC_FLOW_RKEY) + allowed_remote = SMC_LLC_FLOW_RKEY; +again: + if (list_empty(&lgr->list)) + return -ENODEV; + spin_lock_bh(&lgr->llc_flow_lock); + if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE && + (lgr->llc_flow_rmt.type == SMC_LLC_FLOW_NONE || + lgr->llc_flow_rmt.type == allowed_remote)) { + lgr->llc_flow_lcl.type = type; + spin_unlock_bh(&lgr->llc_flow_lock); + return 0; + } + spin_unlock_bh(&lgr->llc_flow_lock); + rc = wait_event_interruptible_timeout(lgr->llc_waiter, + (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE && + (lgr->llc_flow_rmt.type == SMC_LLC_FLOW_NONE || + lgr->llc_flow_rmt.type == allowed_remote)), + SMC_LLC_WAIT_TIME); + if (!rc) + return -ETIMEDOUT; + goto again; +} + +/* finish the current llc flow */ +void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow) +{ + spin_lock_bh(&lgr->llc_flow_lock); + memset(flow, 0, sizeof(*flow)); + flow->type = SMC_LLC_FLOW_NONE; + spin_unlock_bh(&lgr->llc_flow_lock); + if (!list_empty(&lgr->list) && lgr->delayed_event && + flow == &lgr->llc_flow_lcl) + schedule_work(&lgr->llc_event_work); + else + wake_up_interruptible(&lgr->llc_waiter); +} + +/* lnk is optional and used for early wakeup when link goes down, useful in + * cases where we wait for a response on the link after we sent a request + */ +struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, + struct smc_link *lnk, + int time_out, u8 exp_msg) +{ + struct smc_llc_flow *flow = &lgr->llc_flow_lcl; + + wait_event_interruptible_timeout(lgr->llc_waiter, + (flow->qentry || + (lnk && !smc_link_usable(lnk)) || + list_empty(&lgr->list)), + time_out); + if (!flow->qentry || + (lnk && !smc_link_usable(lnk)) || list_empty(&lgr->list)) { + smc_llc_flow_qentry_del(flow); + goto out; + } + if (exp_msg && flow->qentry->msg.raw.hdr.common.type != exp_msg) { + if (exp_msg == SMC_LLC_ADD_LINK && + flow->qentry->msg.raw.hdr.common.type == + SMC_LLC_DELETE_LINK) { + /* flow_start will delay the unexpected msg */ + smc_llc_flow_start(&lgr->llc_flow_lcl, + smc_llc_flow_qentry_clr(flow)); + return NULL; + } + smc_llc_flow_qentry_del(flow); + } +out: + return flow->qentry; +} + /********************************** send *************************************/ struct smc_llc_tx_pend { @@ -547,6 +695,16 @@ static void smc_llc_event_work(struct work_struct *work) llc_event_work); struct smc_llc_qentry *qentry; + if (!lgr->llc_flow_lcl.type && lgr->delayed_event) { + if (smc_link_usable(lgr->delayed_event->link)) { + smc_llc_event_handler(lgr->delayed_event); + } else { + qentry = lgr->delayed_event; + lgr->delayed_event = NULL; + kfree(qentry); + } + } + again: spin_lock_bh(&lgr->llc_event_q_lock); if (!list_empty(&lgr->llc_event_q)) { @@ -676,6 +834,8 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) INIT_WORK(&lgr->llc_event_work, smc_llc_event_work); INIT_LIST_HEAD(&lgr->llc_event_q); spin_lock_init(&lgr->llc_event_q_lock); + spin_lock_init(&lgr->llc_flow_lock); + init_waitqueue_head(&lgr->llc_waiter); lgr->llc_testlink_time = 
net->ipv4.sysctl_tcp_keepalive_time; } @@ -683,7 +843,12 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) void smc_llc_lgr_clear(struct smc_link_group *lgr) { smc_llc_event_flush(lgr); + wake_up_interruptible_all(&lgr->llc_waiter); cancel_work_sync(&lgr->llc_event_work); + if (lgr->delayed_event) { + kfree(lgr->delayed_event); + lgr->delayed_event = NULL; + } } int smc_llc_link_init(struct smc_link *link) diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 66063f22166b..49e99ff00ee7 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -63,6 +63,14 @@ int smc_llc_do_confirm_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); int smc_llc_do_delete_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); +int smc_llc_flow_initiate(struct smc_link_group *lgr, + enum smc_llc_flowtype type); +void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow); +struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, + struct smc_link *lnk, + int time_out, u8 exp_msg); +struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow); +void smc_llc_flow_qentry_del(struct smc_llc_flow *flow); int smc_llc_init(void) __init; #endif /* SMC_LLC_H */ -- cgit v1.2.3 From a6688d919b220bd714948e03bb3caa8a66895005 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:39 +0200 Subject: net/smc: enqueue all received LLC messages Introduce smc_llc_enqueue() to enqueue LLC messages, and adapt smc_llc_rx_handler() to enqueue all received LLC messages. smc_llc_enqueue() also makes it possible to enqueue LLC messages from local code. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_llc.c | 46 +++++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 647cf1a2dfa5..a146b3b43580 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -719,11 +719,14 @@ again: } /* process llc responses in tasklet context */ -static void smc_llc_rx_response(struct smc_link *link, union smc_llc_msg *llc) +static void smc_llc_rx_response(struct smc_link *link, + struct smc_llc_qentry *qentry) { + u8 llc_type = qentry->msg.raw.hdr.common.type; + union smc_llc_msg *llc = &qentry->msg; int rc = 0; - switch (llc->raw.hdr.common.type) { + switch (llc_type) { case SMC_LLC_TEST_LINK: if (link->state == SMC_LNK_ACTIVE) complete(&link->llc_testlink_resp); @@ -759,40 +762,49 @@ static void smc_llc_rx_response(struct smc_link *link, union smc_llc_msg *llc) complete(&link->llc_delete_rkey_resp); break; } + kfree(qentry); } -/* copy received msg and add it to the event queue */ -static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) +static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc) { - struct smc_link *link = (struct smc_link *)wc->qp->qp_context; struct smc_link_group *lgr = link->lgr; struct smc_llc_qentry *qentry; - union smc_llc_msg *llc = buf; unsigned long flags; - if (wc->byte_len < sizeof(*llc)) - return; /* short message */ - if (llc->raw.hdr.length != sizeof(*llc)) - return; /* invalid message */ - - /* process responses immediately */ - if (llc->raw.hdr.flags & SMC_LLC_FLAG_RESP) { - smc_llc_rx_response(link, llc); - return; - } - qentry = kmalloc(sizeof(*qentry), GFP_ATOMIC); if (!qentry) return; qentry->link = link; INIT_LIST_HEAD(&qentry->list); memcpy(&qentry->msg, llc, sizeof(union smc_llc_msg)); + + /* process responses immediately */ + if 
(llc->raw.hdr.flags & SMC_LLC_FLAG_RESP) { + smc_llc_rx_response(link, qentry); + return; + } + + /* add requests to event queue */ spin_lock_irqsave(&lgr->llc_event_q_lock, flags); list_add_tail(&qentry->list, &lgr->llc_event_q); spin_unlock_irqrestore(&lgr->llc_event_q_lock, flags); schedule_work(&link->lgr->llc_event_work); } +/* copy received msg and add it to the event queue */ +static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) +{ + struct smc_link *link = (struct smc_link *)wc->qp->qp_context; + union smc_llc_msg *llc = buf; + + if (wc->byte_len < sizeof(*llc)) + return; /* short message */ + if (llc->raw.hdr.length != sizeof(*llc)) + return; /* invalid message */ + + smc_llc_enqueue(link, llc); +} + /***************************** worker, utils *********************************/ static void smc_llc_testlink_work(struct work_struct *work) -- cgit v1.2.3 From 81e6e5e70df46bb5b205e53f2b7885e49a9d4974 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:40 +0200 Subject: net/smc: introduce link group type Add a type field to the link group which reflects the current link group redundancy state. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 70399217ad6f..51366a9f4980 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -197,6 +197,14 @@ struct smc_rtoken { /* address/key of remote RMB */ struct smcd_dev; +enum smc_lgr_type { /* redundancy state of lgr */ + SMC_LGR_NONE, /* no active links, lgr to be deleted */ + SMC_LGR_SINGLE, /* 1 active RNIC on each peer */ + SMC_LGR_SYMMETRIC, /* 2 active RNICs on each peer */ + SMC_LGR_ASYMMETRIC_PEER, /* local has 2, peer 1 active RNICs */ + SMC_LGR_ASYMMETRIC_LOCAL, /* local has 1, peer 2 active RNICs */ +}; + enum smc_llc_flowtype { SMC_LLC_FLOW_NONE = 0, SMC_LLC_FLOW_ADD_LINK = 2, @@ -246,6 +254,8 @@ struct smc_link_group { DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX); /* used rtoken elements */ u8 next_link_id; + enum smc_lgr_type type; + /* redundancy state */ struct list_head llc_event_q; /* queue for llc events */ spinlock_t llc_event_q_lock; -- cgit v1.2.3 From 92334cfcb3a2a102dc1b23513bbe2fca4347e2d6 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:41 +0200 Subject: net/smc: add logic to evaluate CONFIRM_LINK messages to LLC layer Introduce smc_llc_eval_conf_link() to evaluate the CONFIRM_LINK message contents, implementing this logic at the LLC layer. The function will be used by af_smc.c to process the received LLC layer messages. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller
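For orientation, the intended call sequence (a condensed sketch; the server and client adaptations in the following patches wire this in):

	/* wait for the CONFIRM_LINK message assigned to the local flow */
	qentry = smc_llc_wait(link->lgr, link, SMC_LLC_WAIT_TIME,
			      SMC_LLC_CONFIRM_LINK);
	/* evaluate it: SMC_LLC_REQ on the client, SMC_LLC_RESP on the server */
	rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP);
	smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);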
--- net/smc/smc_llc.c | 11 +++++++++++ net/smc/smc_llc.h | 2 ++ 2 files changed, 13 insertions(+) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index a146b3b43580..9248b90fe37e 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -946,6 +946,17 @@ out: return rc; } +/* evaluate confirm link request or response */ +int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry, + enum smc_llc_reqresp type) +{ + if (type == SMC_LLC_REQ) /* SMC server assigns link_id */ + qentry->link->link_id = qentry->msg.confirm_link.link_num; + if (!(qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)) + return -ENOTSUPP; + return 0; +} + /***************************** init, exit, misc ******************************/ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 49e99ff00ee7..637acf91ffb7 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -66,6 +66,8 @@ int smc_llc_do_delete_rkey(struct smc_link *link, int smc_llc_flow_initiate(struct smc_link_group *lgr, enum smc_llc_flowtype type); void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow); +int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry, + enum smc_llc_reqresp type); struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, struct smc_link *lnk, int time_out, u8 exp_msg); -- cgit v1.2.3 From 4667bb4aaabf87d6b97be1b4671b9db340a58cdc Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:42 +0200 Subject: net/smc: adapt SMC server code to use the LLC flow Change the code that processes the SMC server part of connection establishment to use the LLC flow framework (CONFIRM_LINK response messages). Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 39 +++++++++++++++------------------------ net/smc/smc_core.h | 3 --- net/smc/smc_llc.c | 20 +++++--------------- 3 files changed, 20 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e859e3f420d9..ab3aef1ddfa4 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1019,9 +1019,11 @@ void smc_close_non_accepted(struct sock *sk) static int smcr_serv_conf_first_link(struct smc_sock *smc) { struct smc_link *link = smc->conn.lnk; - int rest; + struct smc_llc_qentry *qentry; int rc; + link->lgr->type = SMC_LGR_SINGLE; + if (smcr_link_reg_rmb(link, smc->conn.rmb_desc, false)) return SMC_CLC_DECL_ERR_REGRMB; @@ -1031,40 +1033,27 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) return SMC_CLC_DECL_TIMEOUT_CL; /* receive CONFIRM LINK response from client over the RoCE fabric */ - rest = wait_for_completion_interruptible_timeout( - &link->llc_confirm_resp, - SMC_LLC_WAIT_FIRST_TIME); - if (rest <= 0) { + qentry = smc_llc_wait(link->lgr, link, SMC_LLC_WAIT_TIME, + SMC_LLC_CONFIRM_LINK); + if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); return rc == -EAGAIN ?
SMC_CLC_DECL_TIMEOUT_CL : rc; } - - if (link->llc_confirm_resp_rc) + rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP); + smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl); + if (rc) return SMC_CLC_DECL_RMBE_EC; - /* send ADD LINK request to client over the RoCE fabric */ - rc = smc_llc_send_add_link(link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_REQ); - if (rc < 0) - return SMC_CLC_DECL_TIMEOUT_AL; - - /* receive ADD LINK response from client over the RoCE fabric */ - rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp, - SMC_LLC_WAIT_TIME); - if (rest <= 0) { - struct smc_clc_msg_decline dclc; - - rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); - return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc; - } + /* confirm_rkey is implicit on 1st contact */ + smc->conn.rmb_desc->is_conf_rkey = true; smc_llc_link_active(link); + /* initial contact - try to establish second link */ + /* tbd: call smc_llc_srv_add_link(link); */ return 0; } @@ -1240,7 +1229,9 @@ static int smc_listen_rdma_finish(struct smc_sock *new_smc, goto decline; } /* QP confirmation over RoCE fabric */ + smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK); reason_code = smcr_serv_conf_first_link(new_smc); + smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl); if (reason_code) goto decline; } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 51366a9f4980..01a9cb885ef2 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -121,11 +121,8 @@ struct smc_link { enum smc_link_state state; /* state of link */ struct completion llc_confirm; /* wait for rx of conf link */ - struct completion llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */ int llc_confirm_rc; /* rc from confirm link msg */ - int llc_confirm_resp_rc; /* rc from conf_resp msg */ struct completion llc_add; /* wait for rx of add link */ - struct completion llc_add_resp; /* wait for rx of add link rsp*/ struct delayed_work llc_testlink_wrk; /* testlink worker */ struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 9248b90fe37e..5381b16fd482 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -724,26 +724,18 @@ static void smc_llc_rx_response(struct smc_link *link, { u8 llc_type = qentry->msg.raw.hdr.common.type; union smc_llc_msg *llc = &qentry->msg; - int rc = 0; switch (llc_type) { case SMC_LLC_TEST_LINK: if (link->state == SMC_LNK_ACTIVE) complete(&link->llc_testlink_resp); break; - case SMC_LLC_CONFIRM_LINK: - if (!(llc->raw.hdr.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)) - rc = ENOTSUPP; - if (link->lgr->role == SMC_SERV && - link->state == SMC_LNK_ACTIVATING) { - link->llc_confirm_resp_rc = rc; - complete(&link->llc_confirm_resp); - } - break; case SMC_LLC_ADD_LINK: - if (link->state == SMC_LNK_ACTIVATING) - complete(&link->llc_add_resp); - break; + case SMC_LLC_CONFIRM_LINK: + /* assign responses to the local flow, we requested them */ + smc_llc_flow_qentry_set(&link->lgr->llc_flow_lcl, qentry); + wake_up_interruptible(&link->lgr->llc_waiter); + return; case SMC_LLC_DELETE_LINK: if (link->lgr->role == SMC_SERV) smc_lgr_schedule_free_work_fast(link->lgr); @@ -866,9 +858,7 @@ void smc_llc_lgr_clear(struct smc_link_group *lgr) int smc_llc_link_init(struct smc_link *link) { init_completion(&link->llc_confirm); - init_completion(&link->llc_confirm_resp); init_completion(&link->llc_add); - init_completion(&link->llc_add_resp); 
init_completion(&link->llc_confirm_rkey_resp); init_completion(&link->llc_delete_rkey_resp); mutex_init(&link->llc_delete_rkey_mutex); -- cgit v1.2.3 From 0fb0b02bd6fd26cba38002be4a6bbcae2228fd44 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:43 +0200 Subject: net/smc: adapt SMC client code to use the LLC flow Change the code that processes the SMC client part of connection establishment to use the LLC flow framework (CONFIRM_LINK request messages). Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 69 +++++++++++++++++++++++++++++++++------------------ net/smc/smc_clc.h | 1 + net/smc/smc_core.h | 3 --- net/smc/smc_llc.c | 72 +++++++++++++++++++----------------------------------- 4 files changed, 71 insertions(+), 74 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index ab3aef1ddfa4..bd9662d06896 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -382,22 +382,24 @@ static int smcr_lgr_reg_rmbs(struct smc_link_group *lgr, static int smcr_clnt_conf_first_link(struct smc_sock *smc) { struct smc_link *link = smc->conn.lnk; - int rest; + struct smc_llc_qentry *qentry; int rc; + link->lgr->type = SMC_LGR_SINGLE; + /* receive CONFIRM LINK request from server over RoCE fabric */ - rest = wait_for_completion_interruptible_timeout( - &link->llc_confirm, - SMC_LLC_WAIT_FIRST_TIME); - if (rest <= 0) { + qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_CONFIRM_LINK); + if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; } - - if (link->llc_confirm_rc) + rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ); + smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl); + if (rc) return SMC_CLC_DECL_RMBE_EC; rc = smc_ib_modify_qp_rts(link); @@ -409,31 +411,30 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) if (smcr_link_reg_rmb(link, smc->conn.rmb_desc, false)) return SMC_CLC_DECL_ERR_REGRMB; + /* confirm_rkey is implicit on 1st contact */ + smc->conn.rmb_desc->is_conf_rkey = true; + /* send CONFIRM LINK response over RoCE fabric */ rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP); if (rc < 0) return SMC_CLC_DECL_TIMEOUT_CL; - /* receive ADD LINK request from server over RoCE fabric */ - rest = wait_for_completion_interruptible_timeout(&link->llc_add, - SMC_LLC_WAIT_TIME); - if (rest <= 0) { + smc_llc_link_active(link); + + /* optional 2nd link, receive ADD LINK request from server */ + qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK); + if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); - return rc == -EAGAIN ? 
SMC_CLC_DECL_TIMEOUT_AL : rc; + if (rc == -EAGAIN) + rc = 0; /* no DECLINE received, go with one link */ + return rc; } - - /* send add link reject message, only one link supported for now */ - rc = smc_llc_send_add_link(link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_RESP); - if (rc < 0) - return SMC_CLC_DECL_TIMEOUT_AL; - - smc_llc_link_active(link); - + smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); + /* tbd: call smc_llc_cli_add_link(link, qentry); */ return 0; } @@ -613,8 +614,8 @@ static int smc_connect_rdma(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { + int i, reason_code = 0; struct smc_link *link; - int reason_code = 0; ini->is_smcd = false; ini->ib_lcl = &aclc->lcl; @@ -627,10 +628,28 @@ static int smc_connect_rdma(struct smc_sock *smc, mutex_unlock(&smc_client_lgr_pending); return reason_code; } - link = smc->conn.lnk; smc_conn_save_peer_info(smc, aclc); + if (ini->cln_first_contact == SMC_FIRST_CONTACT) { + link = smc->conn.lnk; + } else { + /* set link that was assigned by server */ + link = NULL; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *l = &smc->conn.lgr->lnk[i]; + + if (l->peer_qpn == ntoh24(aclc->qpn)) { + link = l; + break; + } + } + if (!link) + return smc_connect_abort(smc, SMC_CLC_DECL_NOSRVLINK, + ini->cln_first_contact); + smc->conn.lnk = link; + } + /* create send buffer and rmb */ if (smc_buf_create(smc, false)) return smc_connect_abort(smc, SMC_CLC_DECL_MEM, @@ -666,7 +685,9 @@ static int smc_connect_rdma(struct smc_sock *smc, if (ini->cln_first_contact == SMC_FIRST_CONTACT) { /* QP confirmation over RoCE fabric */ + smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK); reason_code = smcr_clnt_conf_first_link(smc); + smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl); if (reason_code) return smc_connect_abort(smc, reason_code, ini->cln_first_contact); diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 4f2e150a2be1..465876701b75 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -45,6 +45,7 @@ #define SMC_CLC_DECL_GETVLANERR 0x03080000 /* err to get vlan id of ip device*/ #define SMC_CLC_DECL_ISMVLANERR 0x03090000 /* err to reg vlan id on ism dev */ #define SMC_CLC_DECL_NOACTLINK 0x030a0000 /* no active smc-r link in lgr */ +#define SMC_CLC_DECL_NOSRVLINK 0x030b0000 /* SMC-R link from srv not found */ #define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */ #define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */ #define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */ diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 01a9cb885ef2..31237c4c0d93 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -120,9 +120,6 @@ struct smc_link { struct smc_link_group *lgr; /* parent link group */ enum smc_link_state state; /* state of link */ - struct completion llc_confirm; /* wait for rx of conf link */ - int llc_confirm_rc; /* rc from confirm link msg */ - struct completion llc_add; /* wait for rx of add link */ struct delayed_work llc_testlink_wrk; /* testlink worker */ struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 5381b16fd482..644e9ab0dec5 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -528,47 +528,6 @@ static int smc_llc_send_message(struct smc_link *link, void *llcbuf) /********************************* receive ***********************************/ -static void 
smc_llc_rx_confirm_link(struct smc_link *link, - struct smc_llc_msg_confirm_link *llc) -{ - struct smc_link_group *lgr = smc_get_lgr(link); - int conf_rc = 0; - - /* RMBE eyecatchers are not supported */ - if (!(llc->hd.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)) - conf_rc = ENOTSUPP; - - if (lgr->role == SMC_CLNT && - link->state == SMC_LNK_ACTIVATING) { - link->llc_confirm_rc = conf_rc; - link->link_id = llc->link_num; - complete(&link->llc_confirm); - } -} - -static void smc_llc_rx_add_link(struct smc_link *link, - struct smc_llc_msg_add_link *llc) -{ - struct smc_link_group *lgr = smc_get_lgr(link); - - if (link->state == SMC_LNK_ACTIVATING) { - complete(&link->llc_add); - return; - } - - if (lgr->role == SMC_SERV) { - smc_llc_prep_add_link(llc, link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_REQ); - - } else { - smc_llc_prep_add_link(llc, link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_RESP); - } - smc_llc_send_message(link, llc); -} - static void smc_llc_rx_delete_link(struct smc_link *link, struct smc_llc_msg_del_link *llc) { @@ -657,6 +616,7 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) { union smc_llc_msg *llc = &qentry->msg; struct smc_link *link = qentry->link; + struct smc_link_group *lgr = link->lgr; if (!smc_link_usable(link)) goto out; @@ -665,11 +625,31 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) case SMC_LLC_TEST_LINK: smc_llc_rx_test_link(link, &llc->test_link); break; - case SMC_LLC_CONFIRM_LINK: - smc_llc_rx_confirm_link(link, &llc->confirm_link); - break; case SMC_LLC_ADD_LINK: - smc_llc_rx_add_link(link, &llc->add_link); + if (list_empty(&lgr->list)) + goto out; /* lgr is terminating */ + if (lgr->role == SMC_CLNT) { + if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_ADD_LINK) { + /* a flow is waiting for this message */ + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, + qentry); + wake_up_interruptible(&lgr->llc_waiter); + } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, + qentry)) { + /* tbd: schedule_work(&lgr->llc_add_link_work); */ + } + } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) { + /* as smc server, handle client suggestion */ + /* tbd: schedule_work(&lgr->llc_add_link_work); */ + } + return; + case SMC_LLC_CONFIRM_LINK: + if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { + /* a flow is waiting for this message */ + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, qentry); + wake_up_interruptible(&lgr->llc_waiter); + return; + } break; case SMC_LLC_DELETE_LINK: smc_llc_rx_delete_link(link, &llc->delete_link); break; @@ -857,8 +837,6 @@ void smc_llc_lgr_clear(struct smc_link_group *lgr) int smc_llc_link_init(struct smc_link *link) { - init_completion(&link->llc_confirm); - init_completion(&link->llc_add); init_completion(&link->llc_confirm_rkey_resp); init_completion(&link->llc_delete_rkey_resp); mutex_init(&link->llc_delete_rkey_mutex); -- cgit v1.2.3 From 3d88a21b0cb6a2661a567e57a431e5aa12ecb203 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:44 +0200 Subject: net/smc: multiple link support and LLC flow for smc_llc_do_confirm_rkey Adapt smc_llc_do_confirm_rkey() to use the LLC flow and support the rkeys of multiple links when the CONFIRM_RKEY LLC message is built. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller
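For illustration, the rtoken layout this change produces in the CONFIRM_RKEY message for a link group with two active links (a sketch following the patch's field names; the rkey and vaddr values are placeholders):

	/* rtoken[0] describes the rmb on the sending link and carries the
	 * count of additional entries
	 */
	rkeyllc->rtoken[0].num_rkeys = 1;
	rkeyllc->rtoken[0].rmb_key   = htonl(send_link_rkey);		/* placeholder */
	rkeyllc->rtoken[0].rmb_vaddr = cpu_to_be64(send_link_vaddr);	/* placeholder */
	/* rtoken[1..n] describe the same rmb on the other active links,
	 * tagged with the respective link_id
	 */
	rkeyllc->rtoken[1].link_id   = other_link_id;			/* placeholder */
	rkeyllc->rtoken[1].rmb_key   = htonl(other_link_rkey);		/* placeholder */
	rkeyllc->rtoken[1].rmb_vaddr = cpu_to_be64(other_link_vaddr);	/* placeholder */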
--- net/smc/smc_core.h | 2 -- net/smc/smc_llc.c | 65 +++++++++++++++++++++++++++++++++++------------------- net/smc/smc_llc.h | 2 +- 3 files changed, 43 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 31237c4c0d93..4e0dfb1d5804 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -123,8 +123,6 @@ struct smc_link { struct delayed_work llc_testlink_wrk; /* testlink worker */ struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ - struct completion llc_confirm_rkey_resp; /* w4 rx of cnf rkey */ - int llc_confirm_rkey_resp_rc; /* rc from cnf rkey */ struct completion llc_delete_rkey_resp; /* w4 rx of del rkey */ int llc_delete_rkey_resp_rc; /* rc from del rkey */ struct mutex llc_delete_rkey_mutex; /* serialize usage */ diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 644e9ab0dec5..5db11f54b4cd 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -369,27 +369,44 @@ int smc_llc_send_confirm_link(struct smc_link *link, } /* send LLC confirm rkey request */ -static int smc_llc_send_confirm_rkey(struct smc_link *link, +static int smc_llc_send_confirm_rkey(struct smc_link *send_link, struct smc_buf_desc *rmb_desc) { struct smc_llc_msg_confirm_rkey *rkeyllc; struct smc_wr_tx_pend_priv *pend; struct smc_wr_buf *wr_buf; - int rc; + struct smc_link *link; + int i, rc, rtok_ix; - rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + rc = smc_llc_add_pending_send(send_link, &wr_buf, &pend); if (rc) return rc; rkeyllc = (struct smc_llc_msg_confirm_rkey *)wr_buf; memset(rkeyllc, 0, sizeof(*rkeyllc)); rkeyllc->hd.common.type = SMC_LLC_CONFIRM_RKEY; rkeyllc->hd.length = sizeof(struct smc_llc_msg_confirm_rkey); + + rtok_ix = 1; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + link = &send_link->lgr->lnk[i]; + if (link->state == SMC_LNK_ACTIVE && link != send_link) { + rkeyllc->rtoken[rtok_ix].link_id = link->link_id; + rkeyllc->rtoken[rtok_ix].rmb_key = + htonl(rmb_desc->mr_rx[link->link_idx]->rkey); + rkeyllc->rtoken[rtok_ix].rmb_vaddr = cpu_to_be64( + (u64)sg_dma_address( + rmb_desc->sgt[link->link_idx].sgl)); + rtok_ix++; + } + } + /* rkey of send_link is in rtoken[0] */ + rkeyllc->rtoken[0].num_rkeys = rtok_ix - 1; rkeyllc->rtoken[0].rmb_key = - htonl(rmb_desc->mr_rx[link->link_idx]->rkey); + htonl(rmb_desc->mr_rx[send_link->link_idx]->rkey); rkeyllc->rtoken[0].rmb_vaddr = cpu_to_be64( - (u64)sg_dma_address(rmb_desc->sgt[link->link_idx].sgl)); + (u64)sg_dma_address(rmb_desc->sgt[send_link->link_idx].sgl)); /* send llc message */ - rc = smc_wr_tx_send(link, pend); + rc = smc_wr_tx_send(send_link, pend); return rc; } @@ -712,6 +729,7 @@ static void smc_llc_rx_response(struct smc_link *link, break; case SMC_LLC_ADD_LINK: case SMC_LLC_CONFIRM_LINK: + case SMC_LLC_CONFIRM_RKEY: /* assign responses to the local flow, we requested them */ smc_llc_flow_qentry_set(&link->lgr->llc_flow_lcl, qentry); wake_up_interruptible(&link->lgr->llc_waiter); @@ -720,11 +738,6 @@ static void smc_llc_rx_response(struct smc_link *link, if (link->lgr->role == SMC_SERV) smc_lgr_schedule_free_work_fast(link->lgr); break; - case SMC_LLC_CONFIRM_RKEY: - link->llc_confirm_rkey_resp_rc = llc->raw.hdr.flags & - SMC_LLC_FLAG_RKEY_NEG; - complete(&link->llc_confirm_rkey_resp); - break; case SMC_LLC_CONFIRM_RKEY_CONT: /* unused as long as we don't send this type of msg */ break; @@ -837,7 +850,6 @@ void smc_llc_lgr_clear(struct smc_link_group *lgr) int smc_llc_link_init(struct smc_link *link) {
- init_completion(&link->llc_confirm_rkey_resp); init_completion(&link->llc_delete_rkey_resp); mutex_init(&link->llc_delete_rkey_mutex); init_completion(&link->llc_testlink_resp); @@ -870,23 +882,30 @@ void smc_llc_link_clear(struct smc_link *link) smc_wr_wakeup_tx_wait(link); } -/* register a new rtoken at the remote peer */ -int smc_llc_do_confirm_rkey(struct smc_link *link, +/* register a new rtoken at the remote peer (for all links) */ +int smc_llc_do_confirm_rkey(struct smc_link *send_link, struct smc_buf_desc *rmb_desc) { - int rc; + struct smc_link_group *lgr = send_link->lgr; + struct smc_llc_qentry *qentry = NULL; + int rc = 0; - /* protected by mutex smc_create_lgr_pending */ - reinit_completion(&link->llc_confirm_rkey_resp); - rc = smc_llc_send_confirm_rkey(link, rmb_desc); + rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); if (rc) return rc; + rc = smc_llc_send_confirm_rkey(send_link, rmb_desc); + if (rc) + goto out; /* receive CONFIRM RKEY response from server over RoCE fabric */ - rc = wait_for_completion_interruptible_timeout( - &link->llc_confirm_rkey_resp, SMC_LLC_WAIT_TIME); - if (rc <= 0 || link->llc_confirm_rkey_resp_rc) - return -EFAULT; - return 0; + qentry = smc_llc_wait(lgr, send_link, SMC_LLC_WAIT_TIME, + SMC_LLC_CONFIRM_RKEY); + if (!qentry || (qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_RKEY_NEG)) + rc = -EFAULT; +out: + if (qentry) + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); + return rc; } /* unregister an rtoken at the remote peer */ diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 637acf91ffb7..d82d8346b61e 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -59,7 +59,7 @@ int smc_llc_link_init(struct smc_link *link); void smc_llc_link_active(struct smc_link *link); void smc_llc_link_deleting(struct smc_link *link); void smc_llc_link_clear(struct smc_link *link); -int smc_llc_do_confirm_rkey(struct smc_link *link, +int smc_llc_do_confirm_rkey(struct smc_link *send_link, struct smc_buf_desc *rmb_desc); int smc_llc_do_delete_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); -- cgit v1.2.3 From 6d74c3a8a3e7a488a7d9d8c4a59091ccae72fc4c Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:45 +0200 Subject: net/smc: multiple link support and LLC flow for smc_llc_do_delete_rkey Adapt smc_llc_do_delete_rkey() to use the LLC flow and support multiple links when deleting the rkeys for rmb buffers at the peer. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_core.c | 10 ++++------ net/smc/smc_core.h | 3 --- net/smc/smc_llc.c | 39 +++++++++++++++++++-------------------- net/smc/smc_llc.h | 2 +- 4 files changed, 24 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 4867ddcfe0c6..f71a366ed6ac 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -446,13 +446,11 @@ out: } static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc, - struct smc_link *lnk) + struct smc_link_group *lgr) { - struct smc_link_group *lgr = lnk->lgr; - if (rmb_desc->is_conf_rkey && !list_empty(&lgr->list)) { /* unregister rmb with peer */ - smc_llc_do_delete_rkey(lnk, rmb_desc); + smc_llc_do_delete_rkey(lgr, rmb_desc); rmb_desc->is_conf_rkey = false; } if (rmb_desc->is_reg_err) { @@ -475,7 +473,7 @@ static void smc_buf_unuse(struct smc_connection *conn, if (conn->rmb_desc && lgr->is_smcd) conn->rmb_desc->used = 0; else if (conn->rmb_desc) - smcr_buf_unuse(conn->rmb_desc, conn->lnk); + smcr_buf_unuse(conn->rmb_desc, lgr); } /* remove a finished connection from its link group */ @@ -1169,7 +1167,6 @@ static int smcr_buf_map_usable_links(struct smc_link_group *lgr, if (!smc_link_usable(lnk)) continue; if (smcr_buf_map_link(buf_desc, is_rmb, lnk)) { - smcr_buf_unuse(buf_desc, lnk); rc = -ENOMEM; goto out; } @@ -1275,6 +1272,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (!is_smcd) { if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) { + smcr_buf_unuse(buf_desc, lgr); return -ENOMEM; } } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 4e0dfb1d5804..364a54e28d61 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -123,9 +123,6 @@ struct smc_link { struct delayed_work llc_testlink_wrk; /* testlink worker */ struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ - struct completion llc_delete_rkey_resp; /* w4 rx of del rkey */ - int llc_delete_rkey_resp_rc; /* rc from del rkey */ - struct mutex llc_delete_rkey_mutex; /* serialize usage */ }; /* For now we just allow one parallel link per link group. 
The SMC protocol diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 5db11f54b4cd..f9ec270818fa 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -720,7 +720,6 @@ static void smc_llc_rx_response(struct smc_link *link, struct smc_llc_qentry *qentry) { u8 llc_type = qentry->msg.raw.hdr.common.type; - union smc_llc_msg *llc = &qentry->msg; switch (llc_type) { case SMC_LLC_TEST_LINK: @@ -730,6 +729,7 @@ static void smc_llc_rx_response(struct smc_link *link, case SMC_LLC_ADD_LINK: case SMC_LLC_CONFIRM_LINK: case SMC_LLC_CONFIRM_RKEY: + case SMC_LLC_DELETE_RKEY: /* assign responses to the local flow, we requested them */ smc_llc_flow_qentry_set(&link->lgr->llc_flow_lcl, qentry); wake_up_interruptible(&link->lgr->llc_waiter); @@ -741,11 +741,6 @@ static void smc_llc_rx_response(struct smc_link *link, case SMC_LLC_CONFIRM_RKEY_CONT: /* unused as long as we don't send this type of msg */ break; - case SMC_LLC_DELETE_RKEY: - link->llc_delete_rkey_resp_rc = llc->raw.hdr.flags & - SMC_LLC_FLAG_RKEY_NEG; - complete(&link->llc_delete_rkey_resp); - break; } kfree(qentry); } @@ -850,8 +845,6 @@ void smc_llc_lgr_clear(struct smc_link_group *lgr) int smc_llc_link_init(struct smc_link *link) { - init_completion(&link->llc_delete_rkey_resp); - mutex_init(&link->llc_delete_rkey_mutex); init_completion(&link->llc_testlink_resp); INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); return 0; @@ -909,27 +902,33 @@ out: } /* unregister an rtoken at the remote peer */ -int smc_llc_do_delete_rkey(struct smc_link *link, +int smc_llc_do_delete_rkey(struct smc_link_group *lgr, struct smc_buf_desc *rmb_desc) { + struct smc_llc_qentry *qentry = NULL; + struct smc_link *send_link; int rc = 0; - mutex_lock(&link->llc_delete_rkey_mutex); - if (link->state != SMC_LNK_ACTIVE) - goto out; - reinit_completion(&link->llc_delete_rkey_resp); - rc = smc_llc_send_delete_rkey(link, rmb_desc); + send_link = smc_llc_usable_link(lgr); + if (!send_link) + return -ENOLINK; + + rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); + if (rc) + return rc; + /* protected by llc_flow control */ + rc = smc_llc_send_delete_rkey(send_link, rmb_desc); if (rc) goto out; /* receive DELETE RKEY response from server over RoCE fabric */ - rc = wait_for_completion_interruptible_timeout( - &link->llc_delete_rkey_resp, SMC_LLC_WAIT_TIME); - if (rc <= 0 || link->llc_delete_rkey_resp_rc) + qentry = smc_llc_wait(lgr, send_link, SMC_LLC_WAIT_TIME, + SMC_LLC_DELETE_RKEY); + if (!qentry || (qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_RKEY_NEG)) rc = -EFAULT; - else - rc = 0; out: - mutex_unlock(&link->llc_delete_rkey_mutex); + if (qentry) + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); return rc; } diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index d82d8346b61e..e9f23affece6 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -61,7 +61,7 @@ void smc_llc_link_deleting(struct smc_link *link); void smc_llc_link_clear(struct smc_link *link); int smc_llc_do_confirm_rkey(struct smc_link *send_link, struct smc_buf_desc *rmb_desc); -int smc_llc_do_delete_rkey(struct smc_link *link, +int smc_llc_do_delete_rkey(struct smc_link_group *lgr, struct smc_buf_desc *rmb_desc); int smc_llc_flow_initiate(struct smc_link_group *lgr, enum smc_llc_flowtype type); -- cgit v1.2.3 From 56e8091c7a098ef2257f85f16665d79cf3049da9 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:46 +0200 Subject: net/smc: move the TEST_LINK response processing into event handler Get rid of the extra 
function and move the two-liner for the TEST_LINK response processing into the event handler function. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_llc.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index f9ec270818fa..4945abbad111 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -563,13 +563,6 @@ static void smc_llc_rx_delete_link(struct smc_link *link, smc_lgr_terminate_sched(lgr); } -static void smc_llc_rx_test_link(struct smc_link *link, - struct smc_llc_msg_test_link *llc) -{ - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc); -} - static void smc_llc_rx_confirm_rkey(struct smc_link *link, struct smc_llc_msg_confirm_rkey *llc) { @@ -640,7 +633,8 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) switch (llc->raw.hdr.common.type) { case SMC_LLC_TEST_LINK: - smc_llc_rx_test_link(link, &llc->test_link); + llc->test_link.hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, llc); break; case SMC_LLC_ADD_LINK: if (list_empty(&lgr->list)) -- cgit v1.2.3 From ba21abd22f9ffa023921923a6c01d28b59731ff8 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:47 +0200 Subject: net/smc: new smc_rtoken_set functions for multiple link support Introduce smc_rtoken_set() to set the rtoken for a new link to an existing rmb whose rtoken is given, and smc_rtoken_set2() to set an rtoken for a new link whose link_id is given. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ net/smc/smc_core.h | 4 ++++ 2 files changed, 51 insertions(+) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index f71a366ed6ac..096dce92ee2b 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1368,6 +1368,53 @@ static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr) return -ENOSPC; } +static int smc_rtoken_find_by_link(struct smc_link_group *lgr, int lnk_idx, + u32 rkey) +{ + int i; + + for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { + if (test_bit(i, lgr->rtokens_used_mask) && + lgr->rtokens[i][lnk_idx].rkey == rkey) + return i; + } + return -ENOENT; +} + +/* set rtoken for a new link to an existing rmb */ +void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new, + __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey) +{ + int rtok_idx; + + rtok_idx = smc_rtoken_find_by_link(lgr, link_idx, ntohl(nw_rkey_known)); + if (rtok_idx == -ENOENT) + return; + lgr->rtokens[rtok_idx][link_idx_new].rkey = ntohl(nw_rkey); + lgr->rtokens[rtok_idx][link_idx_new].dma_addr = be64_to_cpu(nw_vaddr); +} + +/* set rtoken for a new link whose link_id is given */ +void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id, + __be64 nw_vaddr, __be32 nw_rkey) +{ + u64 dma_addr = be64_to_cpu(nw_vaddr); + u32 rkey = ntohl(nw_rkey); + bool found = false; + int link_idx; + + for (link_idx = 0; link_idx < SMC_LINKS_PER_LGR_MAX; link_idx++) { + if (lgr->lnk[link_idx].link_id == link_id) { + found = true; + break; + } + } + if (!found) + return; + lgr->rtokens[rtok_idx][link_idx].rkey = rkey; + lgr->rtokens[rtok_idx][link_idx].dma_addr = dma_addr; +} + /* add a new rtoken from peer */ int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey) { diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 364a54e28d61..0be26386057f 100644 --- 
a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -352,6 +352,10 @@ int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link, struct smc_clc_msg_accept_confirm *clc); int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey); int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey); +void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new, + __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey); +void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id, + __be64 nw_vaddr, __be32 nw_rkey); void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); -- cgit v1.2.3 From 3bc67e098c3e215f6e09ba3c0e1f569e7ae020d0 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:48 +0200 Subject: net/smc: adapt SMC remote CONFIRM_RKEY processing to use the LLC flow Use the LLC flow framework for the processing of CONFIRM_RKEY messages that were received from the peer. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_llc.c | 56 ++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 4945abbad111..b7b5cc01b78e 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -105,6 +105,7 @@ struct smc_llc_msg_confirm_rkey_cont { /* type 0x08 */ }; #define SMC_LLC_DEL_RKEY_MAX 8 +#define SMC_LLC_FLAG_RKEY_RETRY 0x10 #define SMC_LLC_FLAG_RKEY_NEG 0x20 struct smc_llc_msg_delete_rkey { /* type 0x09 */ @@ -563,21 +564,41 @@ static void smc_llc_rx_delete_link(struct smc_link *link, smc_lgr_terminate_sched(lgr); } -static void smc_llc_rx_confirm_rkey(struct smc_link *link, - struct smc_llc_msg_confirm_rkey *llc) +/* process a confirm_rkey request from peer, remote flow */ +static void smc_llc_rmt_conf_rkey(struct smc_link_group *lgr) { - int rc; - - rc = smc_rtoken_add(link, - llc->rtoken[0].rmb_vaddr, - llc->rtoken[0].rmb_key); - - /* ignore rtokens for other links, we have only one link */ - + struct smc_llc_msg_confirm_rkey *llc; + struct smc_llc_qentry *qentry; + struct smc_link *link; + int num_entries; + int rk_idx; + int i; + + qentry = lgr->llc_flow_rmt.qentry; + llc = &qentry->msg.confirm_rkey; + link = qentry->link; + + num_entries = llc->rtoken[0].num_rkeys; + /* first rkey entry is for receiving link */ + rk_idx = smc_rtoken_add(link, + llc->rtoken[0].rmb_vaddr, + llc->rtoken[0].rmb_key); + if (rk_idx < 0) + goto out_err; + + for (i = 1; i <= min_t(u8, num_entries, SMC_LLC_RKEYS_PER_MSG - 1); i++) + smc_rtoken_set2(lgr, rk_idx, llc->rtoken[i].link_id, + llc->rtoken[i].rmb_vaddr, + llc->rtoken[i].rmb_key); + /* max links is 3 so there is no need to support conf_rkey_cont msgs */ + goto out; +out_err: + llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; + llc->hd.flags |= SMC_LLC_FLAG_RKEY_RETRY; +out: llc->hd.flags |= SMC_LLC_FLAG_RESP; - if (rc < 0) - llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; - smc_llc_send_message(link, llc); + smc_llc_send_message(link, &qentry->msg); + smc_llc_flow_qentry_del(&lgr->llc_flow_rmt); } static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link, @@ -666,8 +687,13 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) smc_llc_rx_delete_link(link, &llc->delete_link); break; case SMC_LLC_CONFIRM_RKEY: - smc_llc_rx_confirm_rkey(link, &llc->confirm_rkey); - break; + /* new request 
from remote, assign to remote flow */ + if (smc_llc_flow_start(&lgr->llc_flow_rmt, qentry)) { + /* process here, does not wait for more llc msgs */ + smc_llc_rmt_conf_rkey(lgr); + smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt); + } + return; case SMC_LLC_CONFIRM_RKEY_CONT: smc_llc_rx_confirm_rkey_cont(link, &llc->confirm_rkey_cont); break; -- cgit v1.2.3 From 218b24fe381238941a06496eaf221a22c5935267 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:49 +0200 Subject: net/smc: adapt SMC remote DELETE_RKEY processing to use the LLC flow Use the LLC flow framework for the processing of DELETE_RKEY messages that were received from the peer. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_llc.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index b7b5cc01b78e..e458207bde9e 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -601,31 +601,37 @@ out: smc_llc_flow_qentry_del(&lgr->llc_flow_rmt); } -static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link, - struct smc_llc_msg_confirm_rkey_cont *llc) -{ - /* ignore rtokens for other links, we have only one link */ - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc); -} - -static void smc_llc_rx_delete_rkey(struct smc_link *link, - struct smc_llc_msg_delete_rkey *llc) +/* process a delete_rkey request from peer, remote flow */ +static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr) { + struct smc_llc_msg_delete_rkey *llc; + struct smc_llc_qentry *qentry; + struct smc_link *link; u8 err_mask = 0; int i, max; + qentry = lgr->llc_flow_rmt.qentry; + llc = &qentry->msg.delete_rkey; + link = qentry->link; + max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); for (i = 0; i < max; i++) { if (smc_rtoken_delete(link, llc->rkey[i])) err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i); } - if (err_mask) { llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; llc->err_mask = err_mask; } + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, &qentry->msg); + smc_llc_flow_qentry_del(&lgr->llc_flow_rmt); +} +static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link, + struct smc_llc_msg_confirm_rkey_cont *llc) +{ + /* ignore rtokens for other links, we have only one link */ llc->hd.flags |= SMC_LLC_FLAG_RESP; smc_llc_send_message(link, llc); } @@ -698,8 +704,13 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) smc_llc_rx_confirm_rkey_cont(link, &llc->confirm_rkey_cont); break; case SMC_LLC_DELETE_RKEY: - smc_llc_rx_delete_rkey(link, &llc->delete_rkey); - break; + /* new request from remote, assign to remote flow */ + if (smc_llc_flow_start(&lgr->llc_flow_rmt, qentry)) { + /* process here, does not wait for more llc msgs */ + smc_llc_rmt_delete_rkey(lgr); + smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt); + } + return; } out: kfree(qentry); -- cgit v1.2.3 From 42d18acce9e29b61f5dbfc5118d7c72093e703a1 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:50 +0200 Subject: net/smc: remove handling of CONFIRM_RKEY_CONTINUE The new SMC-R multiple link support will support a maximum of 3 links, and one CONFIRM_RKEY LLC message can transport 3 rkeys of an rmb buffer. There is no need for the LLC message type CONFIRM_RKEY_CONTINUE which is needed when more than 3 rkeys per rmb buffer needs to be exchanged. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_llc.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index e458207bde9e..92c9a8a8aaf9 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -98,12 +98,6 @@ struct smc_llc_msg_confirm_rkey { /* type 0x06 */ u8 reserved; }; -struct smc_llc_msg_confirm_rkey_cont { /* type 0x08 */ - struct smc_llc_hdr hd; - u8 num_rkeys; - struct smc_rmb_rtoken rtoken[SMC_LLC_RKEYS_PER_MSG]; -}; - #define SMC_LLC_DEL_RKEY_MAX 8 #define SMC_LLC_FLAG_RKEY_RETRY 0x10 #define SMC_LLC_FLAG_RKEY_NEG 0x20 @@ -123,7 +117,6 @@ union smc_llc_msg { struct smc_llc_msg_del_link delete_link; struct smc_llc_msg_confirm_rkey confirm_rkey; - struct smc_llc_msg_confirm_rkey_cont confirm_rkey_cont; struct smc_llc_msg_delete_rkey delete_rkey; struct smc_llc_msg_test_link test_link; @@ -628,14 +621,6 @@ static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr) smc_llc_flow_qentry_del(&lgr->llc_flow_rmt); } -static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link, - struct smc_llc_msg_confirm_rkey_cont *llc) -{ - /* ignore rtokens for other links, we have only one link */ - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc); -} - /* flush the llc event queue */ static void smc_llc_event_flush(struct smc_link_group *lgr) { @@ -701,7 +686,9 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) } return; case SMC_LLC_CONFIRM_RKEY_CONT: - smc_llc_rx_confirm_rkey_cont(link, &llc->confirm_rkey_cont); + /* not used because max links is 3, and 3 rkeys fit into + * one CONFIRM_RKEY message + */ break; case SMC_LLC_DELETE_RKEY: /* new request from remote, assign to remote flow */ @@ -770,7 +757,7 @@ static void smc_llc_rx_response(struct smc_link *link, smc_lgr_schedule_free_work_fast(link->lgr); break; case SMC_LLC_CONFIRM_RKEY_CONT: - /* unused as long as we don't send this type of msg */ + /* not used because max links is 3 */ break; } kfree(qentry); -- cgit v1.2.3 From 41a211d862242439c9cdb2481946bb0928760541 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:51 +0200 Subject: net/smc: remove obsolete link state DELETING The connection layer in af_smc.c is now using the new LLC flow framework, which made the link state DELETING obsolete. Remove the state and the respective helpers. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_core.c | 4 +--- net/smc/smc_core.h | 1 - net/smc/smc_llc.c | 7 ------- net/smc/smc_llc.h | 1 - 4 files changed, 1 insertion(+), 12 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 096dce92ee2b..3539ceef9a97 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -200,7 +200,6 @@ static int smcr_link_send_delete(struct smc_link *lnk, bool orderly) { if (lnk->state == SMC_LNK_ACTIVE && !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, orderly)) { - smc_llc_link_deleting(lnk); return 0; } return -ENOTCONN; @@ -767,8 +766,7 @@ void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) continue; /* tbd - terminate only when no more links are active */ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (!smc_link_usable(&lgr->lnk[i]) || - lgr->lnk[i].state == SMC_LNK_DELETING) + if (!smc_link_usable(&lgr->lnk[i])) continue; if (lgr->lnk[i].smcibdev == smcibdev && lgr->lnk[i].ibport == ibport) { diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 0be26386057f..f12474cc666c 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -36,7 +36,6 @@ enum smc_link_state { /* possible states of a link */ SMC_LNK_INACTIVE, /* link is inactive */ SMC_LNK_ACTIVATING, /* link is being activated */ SMC_LNK_ACTIVE, /* link is active */ - SMC_LNK_DELETING, /* link is being deleted */ }; #define SMC_WR_BUF_SIZE 48 /* size of work request buffer */ diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 92c9a8a8aaf9..327cf30b98cc 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -545,7 +545,6 @@ static void smc_llc_rx_delete_link(struct smc_link *link, struct smc_link_group *lgr = smc_get_lgr(link); smc_lgr_forget(lgr); - smc_llc_link_deleting(link); if (lgr->role == SMC_SERV) { /* client asks to delete this link, send request */ smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ, true); @@ -878,12 +877,6 @@ void smc_llc_link_active(struct smc_link *link) } } -void smc_llc_link_deleting(struct smc_link *link) -{ - link->state = SMC_LNK_DELETING; - smc_wr_wakeup_tx_wait(link); -} - /* called in worker context */ void smc_llc_link_clear(struct smc_link *link) { diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index e9f23affece6..48029a5e14c3 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -57,7 +57,6 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc); void smc_llc_lgr_clear(struct smc_link_group *lgr); int smc_llc_link_init(struct smc_link *link); void smc_llc_link_active(struct smc_link *link); -void smc_llc_link_deleting(struct smc_link *link); void smc_llc_link_clear(struct smc_link *link); int smc_llc_do_confirm_rkey(struct smc_link *send_link, struct smc_buf_desc *rmb_desc); -- cgit v1.2.3 From 6e3a401fc8af01828bcdc92d713195d318b36e7e Mon Sep 17 00:00:00 2001 From: Dmitry Yakunin Date: Thu, 30 Apr 2020 18:51:14 +0300 Subject: inet_diag: add cgroup id attribute This patch adds the cgroup v2 ID to the common inet diag message attributes. The cgroup v2 ID is the kernfs ID (ino or ino+gen). This attribute allows filtering inet diag output by the cgroup ID obtained via the name_to_handle_at() syscall, as sketched below. When the net_cls or net_prio cgroup is activated, this ID is equal to 1 (the root cgroup ID) for newly created sockets.
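[Editor's illustration, not part of the patch: a minimal userspace sketch of obtaining the cgroup v2 ID with name_to_handle_at(), as the commit message suggests. It assumes the 64-bit ID occupies the first eight bytes of f_handle, which matches current kernfs behaviour but is not a documented ABI, and the cgroup2 mount path is illustrative.]

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Return the cgroup v2 ID for a cgroupfs path, or 0 on error. */
    static uint64_t get_cgroup_id(const char *path)
    {
    	struct file_handle *fh;
    	uint64_t id = 0;
    	int mount_id;

    	fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
    	if (!fh)
    		return 0;
    	fh->handle_bytes = MAX_HANDLE_SZ;
    	if (name_to_handle_at(AT_FDCWD, path, fh, &mount_id, 0) == 0)
    		memcpy(&id, fh->f_handle, sizeof(id)); /* assumption: ID is the first 8 bytes */
    	free(fh);
    	return id;
    }

    int main(void)
    {
    	/* illustrative cgroup2 mount point */
    	printf("cgroup id: %llu\n",
    	       (unsigned long long)get_cgroup_id("/sys/fs/cgroup/unified"));
    	return 0;
    }

The resulting value can then be compared against the INET_DIAG_CGROUP_ID attribute that this patch adds to the dump.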
Some notes about this ID: 1) gets initialized in socket() syscall 2) incoming socket gets ID from listening socket (not during accept() syscall) 3) not changed when the process gets moved to another cgroup 4) can point to a deleted cgroup (refcounting) v2: - use CONFIG_SOCK_CGROUP_DATA instead of CONFIG_CGROUPS v3: - fix attr size by using nla_total_size_64bit() (Eric Dumazet) - more detailed commit message (Konstantin Khlebnikov) Signed-off-by: Dmitry Yakunin Reviewed-by: Konstantin Khlebnikov Acked-by: Tejun Heo Signed-off-by: David S. Miller --- include/linux/inet_diag.h | 6 +++++- include/uapi/linux/inet_diag.h | 1 + net/ipv4/inet_diag.c | 7 +++++++ 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index ce9ed1c0602f..0ef2d800fda7 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -71,7 +71,11 @@ static inline size_t inet_diag_msg_attrs_size(void) + nla_total_size(1) /* INET_DIAG_SKV6ONLY */ #endif + nla_total_size(4) /* INET_DIAG_MARK */ - + nla_total_size(4); /* INET_DIAG_CLASS_ID */ + + nla_total_size(4) /* INET_DIAG_CLASS_ID */ +#ifdef CONFIG_SOCK_CGROUP_DATA + + nla_total_size_64bit(sizeof(u64)) /* INET_DIAG_CGROUP_ID */ +#endif + ; } int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct inet_diag_msg *r, int ext, diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 57cc429a9177..c9b1e551792c 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -157,6 +157,7 @@ enum { INET_DIAG_MD5SIG, INET_DIAG_ULP_INFO, INET_DIAG_SK_BPF_STORAGES, + INET_DIAG_CGROUP_ID, __INET_DIAG_MAX, }; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 5d50aad3cdbf..9c4c315cbc10 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -162,6 +162,13 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, goto errout; } +#ifdef CONFIG_SOCK_CGROUP_DATA + if (nla_put_u64_64bit(skb, INET_DIAG_CGROUP_ID, + cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)), + INET_DIAG_PAD)) + goto errout; +#endif + r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); r->idiag_inode = sock_i_ino(sk); -- cgit v1.2.3 From b1f3e43dbfacfcd95296b0f80f84b186add9ef54 Mon Sep 17 00:00:00 2001 From: Dmitry Yakunin Date: Thu, 30 Apr 2020 18:51:15 +0300 Subject: inet_diag: add support for cgroup filter This patch adds the ability to filter sockets based on the cgroup v2 ID. Such a filter is helpful in the ss utility for filtering sockets by cgroup pathname. Signed-off-by: Dmitry Yakunin Reviewed-by: Konstantin Khlebnikov Signed-off-by: David S.
Miller --- include/uapi/linux/inet_diag.h | 1 + net/ipv4/inet_diag.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index c9b1e551792c..e6f183ee8417 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -96,6 +96,7 @@ enum { INET_DIAG_BC_MARK_COND, INET_DIAG_BC_S_EQ, INET_DIAG_BC_D_EQ, + INET_DIAG_BC_CGROUP_COND, /* u64 cgroup v2 ID */ }; struct inet_diag_hostcond { diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 9c4c315cbc10..0034092358c3 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -43,6 +43,9 @@ struct inet_diag_entry { u16 userlocks; u32 ifindex; u32 mark; +#ifdef CONFIG_SOCK_CGROUP_DATA + u64 cgroup_id; +#endif }; static DEFINE_MUTEX(inet_diag_table_mutex); @@ -682,6 +685,16 @@ static int inet_diag_bc_run(const struct nlattr *_bc, yes = 0; break; } +#ifdef CONFIG_SOCK_CGROUP_DATA + case INET_DIAG_BC_CGROUP_COND: { + u64 cgroup_id; + + cgroup_id = get_unaligned((const u64 *)(op + 1)); + if (cgroup_id != entry->cgroup_id) + yes = 0; + break; + } +#endif } if (yes) { @@ -732,6 +745,9 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark; else entry.mark = 0; +#ifdef CONFIG_SOCK_CGROUP_DATA + entry.cgroup_id = cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)); +#endif return inet_diag_bc_run(bc, &entry); } @@ -821,6 +837,15 @@ static bool valid_markcond(const struct inet_diag_bc_op *op, int len, return len >= *min_len; } +#ifdef CONFIG_SOCK_CGROUP_DATA +static bool valid_cgroupcond(const struct inet_diag_bc_op *op, int len, + int *min_len) +{ + *min_len += sizeof(u64); + return len >= *min_len; +} +#endif + static int inet_diag_bc_audit(const struct nlattr *attr, const struct sk_buff *skb) { @@ -863,6 +888,12 @@ static int inet_diag_bc_audit(const struct nlattr *attr, if (!valid_markcond(bc, len, &min_len)) return -EINVAL; break; +#ifdef CONFIG_SOCK_CGROUP_DATA + case INET_DIAG_BC_CGROUP_COND: + if (!valid_cgroupcond(bc, len, &min_len)) + return -EINVAL; + break; +#endif case INET_DIAG_BC_AUTO: case INET_DIAG_BC_JMP: case INET_DIAG_BC_NOP: -- cgit v1.2.3 From 40e79150c1686263e6a031d7702aec63aff31332 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:03:57 +0200 Subject: docs: networking: convert lapb-module.txt to ReST - add SPDX header; - adjust title markup; - mark code blocks and literals as such; - mark tables as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/lapb-module.rst | 305 +++++++++++++++++++++++++++++++ Documentation/networking/lapb-module.txt | 263 -------------------------- MAINTAINERS | 2 +- net/lapb/Kconfig | 2 +- 5 files changed, 308 insertions(+), 265 deletions(-) create mode 100644 Documentation/networking/lapb-module.rst delete mode 100644 Documentation/networking/lapb-module.txt (limited to 'net') diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 0c5d7a037983..acd2567cf0d4 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -75,6 +75,7 @@ Contents: ipvs-sysctl kcm l2tp + lapb-module .. 
only:: subproject and html diff --git a/Documentation/networking/lapb-module.rst b/Documentation/networking/lapb-module.rst new file mode 100644 index 000000000000..ff586bc9f005 --- /dev/null +++ b/Documentation/networking/lapb-module.rst @@ -0,0 +1,305 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============================== +The Linux LAPB Module Interface +=============================== + +Version 1.3 + +Jonathan Naylor 29.12.96 + +Changed (Henner Eisen, 2000-10-29): int return value for data_indication() + +The LAPB module will be a separately compiled module for use by any parts of +the Linux operating system that require a LAPB service. This document +defines the interfaces to, and the services provided by this module. The +term module in this context does not imply that the LAPB module is a +separately loadable module, although it may be. The term module is used in +its more standard meaning. + +The interface to the LAPB module consists of functions to the module, +callbacks from the module to indicate important state changes, and +structures for getting and setting information about the module. + +Structures +---------- + +Probably the most important structure is the skbuff structure for holding +received and transmitted data, however it is beyond the scope of this +document. + +The two LAPB specific structures are the LAPB initialisation structure and +the LAPB parameter structure. These will be defined in a standard header +file, . The header file is internal to the LAPB +module and is not for use. + +LAPB Initialisation Structure +----------------------------- + +This structure is used only once, in the call to lapb_register (see below). +It contains information about the device driver that requires the services +of the LAPB module:: + + struct lapb_register_struct { + void (*connect_confirmation)(int token, int reason); + void (*connect_indication)(int token, int reason); + void (*disconnect_confirmation)(int token, int reason); + void (*disconnect_indication)(int token, int reason); + int (*data_indication)(int token, struct sk_buff *skb); + void (*data_transmit)(int token, struct sk_buff *skb); + }; + +Each member of this structure corresponds to a function in the device driver +that is called when a particular event in the LAPB module occurs. These will +be described in detail below. If a callback is not required (!!) then a NULL +may be substituted. + + +LAPB Parameter Structure +------------------------ + +This structure is used with the lapb_getparms and lapb_setparms functions +(see below). They are used to allow the device driver to get and set the +operational parameters of the LAPB implementation for a given connection:: + + struct lapb_parms_struct { + unsigned int t1; + unsigned int t1timer; + unsigned int t2; + unsigned int t2timer; + unsigned int n2; + unsigned int n2count; + unsigned int window; + unsigned int state; + unsigned int mode; + }; + +T1 and T2 are protocol timing parameters and are given in units of 100ms. N2 +is the maximum number of tries on the link before it is declared a failure. +The window size is the maximum number of outstanding data packets allowed to +be unacknowledged by the remote end, the value of the window is between 1 +and 7 for a standard LAPB link, and between 1 and 127 for an extended LAPB +link. + +The mode variable is a bit field used for setting (at present) three values. 
+The bit fields have the following meanings: + +====== ================================================= +Bit Meaning +====== ================================================= +0 LAPB operation (0=LAPB_STANDARD 1=LAPB_EXTENDED). +1 [SM]LP operation (0=LAPB_SLP 1=LAPB=MLP). +2 DTE/DCE operation (0=LAPB_DTE 1=LAPB_DCE) +3-31 Reserved, must be 0. +====== ================================================= + +Extended LAPB operation indicates the use of extended sequence numbers and +consequently larger window sizes, the default is standard LAPB operation. +MLP operation is the same as SLP operation except that the addresses used by +LAPB are different to indicate the mode of operation, the default is Single +Link Procedure. The difference between DCE and DTE operation is (i) the +addresses used for commands and responses, and (ii) when the DCE is not +connected, it sends DM without polls set, every T1. The upper case constant +names will be defined in the public LAPB header file. + + +Functions +--------- + +The LAPB module provides a number of function entry points. + +:: + + int lapb_register(void *token, struct lapb_register_struct); + +This must be called before the LAPB module may be used. If the call is +successful then LAPB_OK is returned. The token must be a unique identifier +generated by the device driver to allow for the unique identification of the +instance of the LAPB link. It is returned by the LAPB module in all of the +callbacks, and is used by the device driver in all calls to the LAPB module. +For multiple LAPB links in a single device driver, multiple calls to +lapb_register must be made. The format of the lapb_register_struct is given +above. The return values are: + +============= ============================= +LAPB_OK LAPB registered successfully. +LAPB_BADTOKEN Token is already registered. +LAPB_NOMEM Out of memory +============= ============================= + +:: + + int lapb_unregister(void *token); + +This releases all the resources associated with a LAPB link. Any current +LAPB link will be abandoned without further messages being passed. After +this call, the value of token is no longer valid for any calls to the LAPB +function. The valid return values are: + +============= =============================== +LAPB_OK LAPB unregistered successfully. +LAPB_BADTOKEN Invalid/unknown LAPB token. +============= =============================== + +:: + + int lapb_getparms(void *token, struct lapb_parms_struct *parms); + +This allows the device driver to get the values of the current LAPB +variables, the lapb_parms_struct is described above. The valid return values +are: + +============= ============================= +LAPB_OK LAPB getparms was successful. +LAPB_BADTOKEN Invalid/unknown LAPB token. +============= ============================= + +:: + + int lapb_setparms(void *token, struct lapb_parms_struct *parms); + +This allows the device driver to set the values of the current LAPB +variables, the lapb_parms_struct is described above. The values of t1timer, +t2timer and n2count are ignored, likewise changing the mode bits when +connected will be ignored. An error implies that none of the values have +been changed. The valid return values are: + +============= ================================================= +LAPB_OK LAPB getparms was successful. +LAPB_BADTOKEN Invalid/unknown LAPB token. +LAPB_INVALUE One of the values was out of its allowable range. 
+============= ================================================= + +:: + + int lapb_connect_request(void *token); + +Initiate a connect using the current parameter settings. The valid return +values are: + +============== ================================= +LAPB_OK LAPB is starting to connect. +LAPB_BADTOKEN Invalid/unknown LAPB token. +LAPB_CONNECTED LAPB module is already connected. +============== ================================= + +:: + + int lapb_disconnect_request(void *token); + +Initiate a disconnect. The valid return values are: + +================= =============================== +LAPB_OK LAPB is starting to disconnect. +LAPB_BADTOKEN Invalid/unknown LAPB token. +LAPB_NOTCONNECTED LAPB module is not connected. +================= =============================== + +:: + + int lapb_data_request(void *token, struct sk_buff *skb); + +Queue data with the LAPB module for transmitting over the link. If the call +is successful then the skbuff is owned by the LAPB module and may not be +used by the device driver again. The valid return values are: + +================= ============================= +LAPB_OK LAPB has accepted the data. +LAPB_BADTOKEN Invalid/unknown LAPB token. +LAPB_NOTCONNECTED LAPB module is not connected. +================= ============================= + +:: + + int lapb_data_received(void *token, struct sk_buff *skb); + +Queue data with the LAPB module which has been received from the device. It +is expected that the data passed to the LAPB module has skb->data pointing +to the beginning of the LAPB data. If the call is successful then the skbuff +is owned by the LAPB module and may not be used by the device driver again. +The valid return values are: + +============= =========================== +LAPB_OK LAPB has accepted the data. +LAPB_BADTOKEN Invalid/unknown LAPB token. +============= =========================== + +Callbacks +--------- + +These callbacks are functions provided by the device driver for the LAPB +module to call when an event occurs. They are registered with the LAPB +module with lapb_register (see above) in the structure lapb_register_struct +(see above). + +:: + + void (*connect_confirmation)(void *token, int reason); + +This is called by the LAPB module when a connection is established after +being requested by a call to lapb_connect_request (see above). The reason is +always LAPB_OK. + +:: + + void (*connect_indication)(void *token, int reason); + +This is called by the LAPB module when the link is established by the remote +system. The value of reason is always LAPB_OK. + +:: + + void (*disconnect_confirmation)(void *token, int reason); + +This is called by the LAPB module when an event occurs after the device +driver has called lapb_disconnect_request (see above). The reason indicates +what has happened. In all cases the LAPB link can be regarded as being +terminated. The values for reason are: + +================= ==================================================== +LAPB_OK The LAPB link was terminated normally. +LAPB_NOTCONNECTED The remote system was not connected. +LAPB_TIMEDOUT No response was received in N2 tries from the remote + system. +================= ==================================================== + +:: + + void (*disconnect_indication)(void *token, int reason); + +This is called by the LAPB module when the link is terminated by the remote +system or another event has occurred to terminate the link. This may be +returned in response to a lapb_connect_request (see above) if the remote +system refused the request. 
The values for reason are: + +================= ==================================================== +LAPB_OK The LAPB link was terminated normally by the remote + system. +LAPB_REFUSED The remote system refused the connect request. +LAPB_NOTCONNECTED The remote system was not connected. +LAPB_TIMEDOUT No response was received in N2 tries from the remote + system. +================= ==================================================== + +:: + + int (*data_indication)(void *token, struct sk_buff *skb); + +This is called by the LAPB module when data has been received from the +remote system that should be passed onto the next layer in the protocol +stack. The skbuff becomes the property of the device driver and the LAPB +module will not perform any more actions on it. The skb->data pointer will +be pointing to the first byte of data after the LAPB header. + +This method should return NET_RX_DROP (as defined in the header +file include/linux/netdevice.h) if and only if the frame was dropped +before it could be delivered to the upper layer. + +:: + + void (*data_transmit)(void *token, struct sk_buff *skb); + +This is called by the LAPB module when data is to be transmitted to the +remote system by the device driver. The skbuff becomes the property of the +device driver and the LAPB module will not perform any more actions on it. +The skb->data pointer will be pointing to the first byte of the LAPB header. diff --git a/Documentation/networking/lapb-module.txt b/Documentation/networking/lapb-module.txt deleted file mode 100644 index d4fc8f221559..000000000000 --- a/Documentation/networking/lapb-module.txt +++ /dev/null @@ -1,263 +0,0 @@ - The Linux LAPB Module Interface 1.3 - - Jonathan Naylor 29.12.96 - -Changed (Henner Eisen, 2000-10-29): int return value for data_indication() - -The LAPB module will be a separately compiled module for use by any parts of -the Linux operating system that require a LAPB service. This document -defines the interfaces to, and the services provided by this module. The -term module in this context does not imply that the LAPB module is a -separately loadable module, although it may be. The term module is used in -its more standard meaning. - -The interface to the LAPB module consists of functions to the module, -callbacks from the module to indicate important state changes, and -structures for getting and setting information about the module. - -Structures ----------- - -Probably the most important structure is the skbuff structure for holding -received and transmitted data, however it is beyond the scope of this -document. - -The two LAPB specific structures are the LAPB initialisation structure and -the LAPB parameter structure. These will be defined in a standard header -file, . The header file is internal to the LAPB -module and is not for use. - -LAPB Initialisation Structure ------------------------------ - -This structure is used only once, in the call to lapb_register (see below). -It contains information about the device driver that requires the services -of the LAPB module. 
- -struct lapb_register_struct { - void (*connect_confirmation)(int token, int reason); - void (*connect_indication)(int token, int reason); - void (*disconnect_confirmation)(int token, int reason); - void (*disconnect_indication)(int token, int reason); - int (*data_indication)(int token, struct sk_buff *skb); - void (*data_transmit)(int token, struct sk_buff *skb); -}; - -Each member of this structure corresponds to a function in the device driver -that is called when a particular event in the LAPB module occurs. These will -be described in detail below. If a callback is not required (!!) then a NULL -may be substituted. - - -LAPB Parameter Structure ------------------------- - -This structure is used with the lapb_getparms and lapb_setparms functions -(see below). They are used to allow the device driver to get and set the -operational parameters of the LAPB implementation for a given connection. - -struct lapb_parms_struct { - unsigned int t1; - unsigned int t1timer; - unsigned int t2; - unsigned int t2timer; - unsigned int n2; - unsigned int n2count; - unsigned int window; - unsigned int state; - unsigned int mode; -}; - -T1 and T2 are protocol timing parameters and are given in units of 100ms. N2 -is the maximum number of tries on the link before it is declared a failure. -The window size is the maximum number of outstanding data packets allowed to -be unacknowledged by the remote end, the value of the window is between 1 -and 7 for a standard LAPB link, and between 1 and 127 for an extended LAPB -link. - -The mode variable is a bit field used for setting (at present) three values. -The bit fields have the following meanings: - -Bit Meaning -0 LAPB operation (0=LAPB_STANDARD 1=LAPB_EXTENDED). -1 [SM]LP operation (0=LAPB_SLP 1=LAPB=MLP). -2 DTE/DCE operation (0=LAPB_DTE 1=LAPB_DCE) -3-31 Reserved, must be 0. - -Extended LAPB operation indicates the use of extended sequence numbers and -consequently larger window sizes, the default is standard LAPB operation. -MLP operation is the same as SLP operation except that the addresses used by -LAPB are different to indicate the mode of operation, the default is Single -Link Procedure. The difference between DCE and DTE operation is (i) the -addresses used for commands and responses, and (ii) when the DCE is not -connected, it sends DM without polls set, every T1. The upper case constant -names will be defined in the public LAPB header file. - - -Functions ---------- - -The LAPB module provides a number of function entry points. - - -int lapb_register(void *token, struct lapb_register_struct); - -This must be called before the LAPB module may be used. If the call is -successful then LAPB_OK is returned. The token must be a unique identifier -generated by the device driver to allow for the unique identification of the -instance of the LAPB link. It is returned by the LAPB module in all of the -callbacks, and is used by the device driver in all calls to the LAPB module. -For multiple LAPB links in a single device driver, multiple calls to -lapb_register must be made. The format of the lapb_register_struct is given -above. The return values are: - -LAPB_OK LAPB registered successfully. -LAPB_BADTOKEN Token is already registered. -LAPB_NOMEM Out of memory - - -int lapb_unregister(void *token); - -This releases all the resources associated with a LAPB link. Any current -LAPB link will be abandoned without further messages being passed. After -this call, the value of token is no longer valid for any calls to the LAPB -function. 
The valid return values are: - -LAPB_OK LAPB unregistered successfully. -LAPB_BADTOKEN Invalid/unknown LAPB token. - - -int lapb_getparms(void *token, struct lapb_parms_struct *parms); - -This allows the device driver to get the values of the current LAPB -variables, the lapb_parms_struct is described above. The valid return values -are: - -LAPB_OK LAPB getparms was successful. -LAPB_BADTOKEN Invalid/unknown LAPB token. - - -int lapb_setparms(void *token, struct lapb_parms_struct *parms); - -This allows the device driver to set the values of the current LAPB -variables, the lapb_parms_struct is described above. The values of t1timer, -t2timer and n2count are ignored, likewise changing the mode bits when -connected will be ignored. An error implies that none of the values have -been changed. The valid return values are: - -LAPB_OK LAPB getparms was successful. -LAPB_BADTOKEN Invalid/unknown LAPB token. -LAPB_INVALUE One of the values was out of its allowable range. - - -int lapb_connect_request(void *token); - -Initiate a connect using the current parameter settings. The valid return -values are: - -LAPB_OK LAPB is starting to connect. -LAPB_BADTOKEN Invalid/unknown LAPB token. -LAPB_CONNECTED LAPB module is already connected. - - -int lapb_disconnect_request(void *token); - -Initiate a disconnect. The valid return values are: - -LAPB_OK LAPB is starting to disconnect. -LAPB_BADTOKEN Invalid/unknown LAPB token. -LAPB_NOTCONNECTED LAPB module is not connected. - - -int lapb_data_request(void *token, struct sk_buff *skb); - -Queue data with the LAPB module for transmitting over the link. If the call -is successful then the skbuff is owned by the LAPB module and may not be -used by the device driver again. The valid return values are: - -LAPB_OK LAPB has accepted the data. -LAPB_BADTOKEN Invalid/unknown LAPB token. -LAPB_NOTCONNECTED LAPB module is not connected. - - -int lapb_data_received(void *token, struct sk_buff *skb); - -Queue data with the LAPB module which has been received from the device. It -is expected that the data passed to the LAPB module has skb->data pointing -to the beginning of the LAPB data. If the call is successful then the skbuff -is owned by the LAPB module and may not be used by the device driver again. -The valid return values are: - -LAPB_OK LAPB has accepted the data. -LAPB_BADTOKEN Invalid/unknown LAPB token. - - -Callbacks ---------- - -These callbacks are functions provided by the device driver for the LAPB -module to call when an event occurs. They are registered with the LAPB -module with lapb_register (see above) in the structure lapb_register_struct -(see above). - - -void (*connect_confirmation)(void *token, int reason); - -This is called by the LAPB module when a connection is established after -being requested by a call to lapb_connect_request (see above). The reason is -always LAPB_OK. - - -void (*connect_indication)(void *token, int reason); - -This is called by the LAPB module when the link is established by the remote -system. The value of reason is always LAPB_OK. - - -void (*disconnect_confirmation)(void *token, int reason); - -This is called by the LAPB module when an event occurs after the device -driver has called lapb_disconnect_request (see above). The reason indicates -what has happened. In all cases the LAPB link can be regarded as being -terminated. The values for reason are: - -LAPB_OK The LAPB link was terminated normally. -LAPB_NOTCONNECTED The remote system was not connected. 
-LAPB_TIMEDOUT No response was received in N2 tries from the remote - system. - - -void (*disconnect_indication)(void *token, int reason); - -This is called by the LAPB module when the link is terminated by the remote -system or another event has occurred to terminate the link. This may be -returned in response to a lapb_connect_request (see above) if the remote -system refused the request. The values for reason are: - -LAPB_OK The LAPB link was terminated normally by the remote - system. -LAPB_REFUSED The remote system refused the connect request. -LAPB_NOTCONNECTED The remote system was not connected. -LAPB_TIMEDOUT No response was received in N2 tries from the remote - system. - - -int (*data_indication)(void *token, struct sk_buff *skb); - -This is called by the LAPB module when data has been received from the -remote system that should be passed onto the next layer in the protocol -stack. The skbuff becomes the property of the device driver and the LAPB -module will not perform any more actions on it. The skb->data pointer will -be pointing to the first byte of data after the LAPB header. - -This method should return NET_RX_DROP (as defined in the header -file include/linux/netdevice.h) if and only if the frame was dropped -before it could be delivered to the upper layer. - - -void (*data_transmit)(void *token, struct sk_buff *skb); - -This is called by the LAPB module when data is to be transmitted to the -remote system by the device driver. The skbuff becomes the property of the -device driver and the LAPB module will not perform any more actions on it. -The skb->data pointer will be pointing to the first byte of the LAPB header. diff --git a/MAINTAINERS b/MAINTAINERS index 3a5f52a3c055..956999d2d979 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9515,7 +9515,7 @@ F: drivers/soc/lantiq LAPB module L: linux-x25@vger.kernel.org S: Orphan -F: Documentation/networking/lapb-module.txt +F: Documentation/networking/lapb-module.rst F: include/*/lapb.h F: net/lapb/ diff --git a/net/lapb/Kconfig b/net/lapb/Kconfig index 6acfc999c952..5b50e8d64f26 100644 --- a/net/lapb/Kconfig +++ b/net/lapb/Kconfig @@ -15,7 +15,7 @@ config LAPB currently supports LAPB only over Ethernet connections. If you want to use LAPB connections over Ethernet, say Y here and to "LAPB over Ethernet driver" below. Read - for technical + for technical details. To compile this driver as a module, choose M here: the -- cgit v1.2.3 From 429ff87bcac75b929d9ffec8d4d24be2616f8052 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:03:59 +0200 Subject: docs: networking: convert mac80211-injection.txt to ReST - add SPDX header; - adjust title markup; - mark code blocks and literals as such; - mark tables as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. 
Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/mac80211-injection.rst | 106 ++++++++++++++++++++++++ Documentation/networking/mac80211-injection.txt | 97 ---------------------- MAINTAINERS | 2 +- net/mac80211/tx.c | 2 +- 5 files changed, 109 insertions(+), 99 deletions(-) create mode 100644 Documentation/networking/mac80211-injection.rst delete mode 100644 Documentation/networking/mac80211-injection.txt (limited to 'net') diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index b3608b177a8b..81c1834bfb57 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -77,6 +77,7 @@ Contents: l2tp lapb-module ltpc + mac80211-injection .. only:: subproject and html diff --git a/Documentation/networking/mac80211-injection.rst b/Documentation/networking/mac80211-injection.rst new file mode 100644 index 000000000000..75d4edcae852 --- /dev/null +++ b/Documentation/networking/mac80211-injection.rst @@ -0,0 +1,106 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================================= +How to use packet injection with mac80211 +========================================= + +mac80211 now allows arbitrary packets to be injected down any Monitor Mode +interface from userland. The packet you inject needs to be composed in the +following format:: + + [ radiotap header ] + [ ieee80211 header ] + [ payload ] + +The radiotap format is discussed in +./Documentation/networking/radiotap-headers.txt. + +Despite many radiotap parameters being currently defined, most only make sense +to appear on received packets. The following information is parsed from the +radiotap headers and used to control injection: + + * IEEE80211_RADIOTAP_FLAGS + + ========================= =========================================== + IEEE80211_RADIOTAP_F_FCS FCS will be removed and recalculated + IEEE80211_RADIOTAP_F_WEP frame will be encrypted if key available + IEEE80211_RADIOTAP_F_FRAG frame will be fragmented if longer than the + current fragmentation threshold. + ========================= =========================================== + + * IEEE80211_RADIOTAP_TX_FLAGS + + ============================= ======================================== + IEEE80211_RADIOTAP_F_TX_NOACK frame should be sent without waiting for + an ACK even if it is a unicast frame + ============================= ======================================== + + * IEEE80211_RADIOTAP_RATE + + legacy rate for the transmission (only for devices without own rate control) + + * IEEE80211_RADIOTAP_MCS + + HT rate for the transmission (only for devices without own rate control). + Also some flags are parsed + + ============================ ======================== + IEEE80211_RADIOTAP_MCS_SGI use short guard interval + IEEE80211_RADIOTAP_MCS_BW_40 send in HT40 mode + ============================ ======================== + + * IEEE80211_RADIOTAP_DATA_RETRIES + + number of retries when either IEEE80211_RADIOTAP_RATE or + IEEE80211_RADIOTAP_MCS was used + + * IEEE80211_RADIOTAP_VHT + + VHT mcs and number of streams used in the transmission (only for devices + without own rate control). Also other fields are parsed + + flags field + IEEE80211_RADIOTAP_VHT_FLAG_SGI: use short guard interval + + bandwidth field + * 1: send using 40MHz channel width + * 4: send using 80MHz channel width + * 11: send using 160MHz channel width + +The injection code can also skip all other currently defined radiotap fields +facilitating replay of captured radiotap headers directly. 
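[Editor's sketch, not part of the original patch: the injection path just described can be exercised from userspace with libpcap. This minimal C program is a hedged illustration; the monitor-mode interface name "mon0" and the frame contents are placeholders, and the byte patterns are taken from the document's own examples that follow.]

    /* Compose radiotap header + 802.11 header and inject via libpcap. */
    #include <pcap.h>
    #include <stdio.h>
    #include <string.h>

    static const unsigned char u8aRadiotap[] = {
    	0x00, 0x00,             /* radiotap version */
    	0x0b, 0x00,             /* radiotap header length */
    	0x04, 0x0c, 0x00, 0x00, /* present bitmap */
    	0x6c,                   /* rate */
    	0x0c,                   /* tx power */
    	0x01,                   /* antenna */
    };

    static const unsigned char u8aIeee80211[] = {
    	0x08, 0x01, 0x00, 0x00,
    	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    	0x13, 0x22, 0x33, 0x44, 0x55, 0x66,
    	0x13, 0x22, 0x33, 0x44, 0x55, 0x66,
    	0x10, 0x86,
    };

    int main(void)
    {
    	char errbuf[PCAP_ERRBUF_SIZE];
    	unsigned char buf[800];
    	size_t len = 0;
    	pcap_t *ppcap;
    	int r;

    	memcpy(buf + len, u8aRadiotap, sizeof(u8aRadiotap));
    	len += sizeof(u8aRadiotap);
    	memcpy(buf + len, u8aIeee80211, sizeof(u8aIeee80211));
    	len += sizeof(u8aIeee80211);
    	/* payload would follow here */

    	ppcap = pcap_open_live("mon0", 800, 1, 20, errbuf);
    	if (!ppcap) {
    		fprintf(stderr, "pcap_open_live: %s\n", errbuf);
    		return 1;
    	}
    	r = pcap_inject(ppcap, buf, len);
    	pcap_close(ppcap);
    	return r < 0;
    }

As the document itself notes, going through libpcap avoids having to bind a raw socket to the right interface by hand.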
+ +Here is an example valid radiotap header defining some parameters:: + + 0x00, 0x00, // <-- radiotap version + 0x0b, 0x00, // <- radiotap header length + 0x04, 0x0c, 0x00, 0x00, // <-- bitmap + 0x6c, // <-- rate + 0x0c, //<-- tx power + 0x01 //<-- antenna + +The ieee80211 header follows immediately afterwards, looking for example like +this:: + + 0x08, 0x01, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x13, 0x22, 0x33, 0x44, 0x55, 0x66, + 0x13, 0x22, 0x33, 0x44, 0x55, 0x66, + 0x10, 0x86 + +Then lastly there is the payload. + +After composing the packet contents, it is sent by send()-ing it to a logical +mac80211 interface that is in Monitor mode. Libpcap can also be used, +(which is easier than doing the work to bind the socket to the right +interface), along the following lines::: + + ppcap = pcap_open_live(szInterfaceName, 800, 1, 20, szErrbuf); + ... + r = pcap_inject(ppcap, u8aSendBuffer, nLength); + +You can also find a link to a complete inject application here: + +http://wireless.kernel.org/en/users/Documentation/packetspammer + +Andy Green diff --git a/Documentation/networking/mac80211-injection.txt b/Documentation/networking/mac80211-injection.txt deleted file mode 100644 index d58d78df9ca2..000000000000 --- a/Documentation/networking/mac80211-injection.txt +++ /dev/null @@ -1,97 +0,0 @@ -How to use packet injection with mac80211 -========================================= - -mac80211 now allows arbitrary packets to be injected down any Monitor Mode -interface from userland. The packet you inject needs to be composed in the -following format: - - [ radiotap header ] - [ ieee80211 header ] - [ payload ] - -The radiotap format is discussed in -./Documentation/networking/radiotap-headers.txt. - -Despite many radiotap parameters being currently defined, most only make sense -to appear on received packets. The following information is parsed from the -radiotap headers and used to control injection: - - * IEEE80211_RADIOTAP_FLAGS - - IEEE80211_RADIOTAP_F_FCS: FCS will be removed and recalculated - IEEE80211_RADIOTAP_F_WEP: frame will be encrypted if key available - IEEE80211_RADIOTAP_F_FRAG: frame will be fragmented if longer than the - current fragmentation threshold. - - * IEEE80211_RADIOTAP_TX_FLAGS - - IEEE80211_RADIOTAP_F_TX_NOACK: frame should be sent without waiting for - an ACK even if it is a unicast frame - - * IEEE80211_RADIOTAP_RATE - - legacy rate for the transmission (only for devices without own rate control) - - * IEEE80211_RADIOTAP_MCS - - HT rate for the transmission (only for devices without own rate control). - Also some flags are parsed - - IEEE80211_RADIOTAP_MCS_SGI: use short guard interval - IEEE80211_RADIOTAP_MCS_BW_40: send in HT40 mode - - * IEEE80211_RADIOTAP_DATA_RETRIES - - number of retries when either IEEE80211_RADIOTAP_RATE or - IEEE80211_RADIOTAP_MCS was used - - * IEEE80211_RADIOTAP_VHT - - VHT mcs and number of streams used in the transmission (only for devices - without own rate control). Also other fields are parsed - - flags field - IEEE80211_RADIOTAP_VHT_FLAG_SGI: use short guard interval - - bandwidth field - 1: send using 40MHz channel width - 4: send using 80MHz channel width - 11: send using 160MHz channel width - -The injection code can also skip all other currently defined radiotap fields -facilitating replay of captured radiotap headers directly. 
- -Here is an example valid radiotap header defining some parameters - - 0x00, 0x00, // <-- radiotap version - 0x0b, 0x00, // <- radiotap header length - 0x04, 0x0c, 0x00, 0x00, // <-- bitmap - 0x6c, // <-- rate - 0x0c, //<-- tx power - 0x01 //<-- antenna - -The ieee80211 header follows immediately afterwards, looking for example like -this: - - 0x08, 0x01, 0x00, 0x00, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0x13, 0x22, 0x33, 0x44, 0x55, 0x66, - 0x13, 0x22, 0x33, 0x44, 0x55, 0x66, - 0x10, 0x86 - -Then lastly there is the payload. - -After composing the packet contents, it is sent by send()-ing it to a logical -mac80211 interface that is in Monitor mode. Libpcap can also be used, -(which is easier than doing the work to bind the socket to the right -interface), along the following lines: - - ppcap = pcap_open_live(szInterfaceName, 800, 1, 20, szErrbuf); -... - r = pcap_inject(ppcap, u8aSendBuffer, nLength); - -You can also find a link to a complete inject application here: - -http://wireless.kernel.org/en/users/Documentation/packetspammer - -Andy Green diff --git a/MAINTAINERS b/MAINTAINERS index 956999d2d979..33bfc9e4aead 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10079,7 +10079,7 @@ S: Maintained W: https://wireless.wiki.kernel.org/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211-next.git -F: Documentation/networking/mac80211-injection.txt +F: Documentation/networking/mac80211-injection.rst F: Documentation/networking/mac80211_hwsim/mac80211_hwsim.rst F: drivers/net/wireless/mac80211_hwsim.[ch] F: include/net/mac80211.h diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 82846aca86d9..9849c14694db 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2144,7 +2144,7 @@ static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local, /* * Please update the file - * Documentation/networking/mac80211-injection.txt + * Documentation/networking/mac80211-injection.rst * when parsing new fields here. */ -- cgit v1.2.3 From c1e4535f24bcfeef55a7ed409a5f50548e284426 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:13 +0200 Subject: docs: networking: convert pktgen.txt to ReST - add SPDX header; - adjust title markup; - use bold markups on a few places; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/pktgen.rst | 412 ++++++++++++++++++++++++++++++++++++ Documentation/networking/pktgen.txt | 400 ---------------------------------- net/Kconfig | 2 +- net/core/pktgen.c | 2 +- samples/pktgen/README.rst | 2 +- 6 files changed, 416 insertions(+), 403 deletions(-) create mode 100644 Documentation/networking/pktgen.rst delete mode 100644 Documentation/networking/pktgen.txt (limited to 'net') diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index e460026331c6..696181a96e3c 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -91,6 +91,7 @@ Contents: operstates packet_mmap phonet + pktgen .. only:: subproject and html diff --git a/Documentation/networking/pktgen.rst b/Documentation/networking/pktgen.rst new file mode 100644 index 000000000000..7afa1c9f1183 --- /dev/null +++ b/Documentation/networking/pktgen.rst @@ -0,0 +1,412 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +==================================== +HOWTO for the linux packet generator +==================================== + +Enable CONFIG_NET_PKTGEN to compile and build pktgen either in-kernel +or as a module. A module is preferred; modprobe pktgen if needed. Once +running, pktgen creates a thread for each CPU with affinity to that CPU. +Monitoring and controlling is done via /proc. It is easiest to select a +suitable sample script and configure that. + +On a dual CPU:: + + ps aux | grep pkt + root 129 0.3 0.0 0 0 ? SW 2003 523:20 [kpktgend_0] + root 130 0.3 0.0 0 0 ? SW 2003 509:50 [kpktgend_1] + + +For monitoring and control pktgen creates:: + + /proc/net/pktgen/pgctrl + /proc/net/pktgen/kpktgend_X + /proc/net/pktgen/ethX + + +Tuning NIC for max performance +============================== + +The default NIC settings are (likely) not tuned for pktgen's artificial +overload type of benchmarking, as this could hurt the normal use-case. + +Specifically increasing the TX ring buffer in the NIC:: + + # ethtool -G ethX tx 1024 + +A larger TX ring can improve pktgen's performance, while it can hurt +in the general case, 1) because the TX ring buffer might get larger +than the CPU's L1/L2 cache, 2) because it allows more queueing in the +NIC HW layer (which is bad for bufferbloat). + +One should hesitate to conclude that packets/descriptors in the HW +TX ring cause delay. Drivers usually delay cleaning up the +ring-buffers for various performance reasons, and packets stalling +the TX ring might just be waiting for cleanup. + +This cleanup issue is specifically the case for the driver ixgbe +(Intel 82599 chip). This driver (ixgbe) combines TX+RX ring cleanups, +and the cleanup interval is affected by the ethtool --coalesce setting +of parameter "rx-usecs". + +For ixgbe use e.g. "30" resulting in approx 33K interrupts/sec (1/30*10^6):: + + # ethtool -C ethX rx-usecs 30 + + +Kernel threads +============== +Pktgen creates a thread for each CPU with affinity to that CPU. +Which is controlled through procfile /proc/net/pktgen/kpktgend_X. + +Example: /proc/net/pktgen/kpktgend_0:: + + Running: + Stopped: eth4@0 + Result: OK: add_device=eth4@0 + +Most important are the devices assigned to the thread. + +The two basic thread commands are: + + * add_device DEVICE@NAME -- adds a single device + * rem_device_all -- remove all associated devices + +When adding a device to a thread, a corresponding procfile is created +which is used for configuring this device. Thus, device names need to +be unique. + +To support adding the same device to multiple threads, which is useful +with multi queue NICs, the device naming scheme is extended with "@": +device@something + +The part after "@" can be anything, but it is custom to use the thread +number. + +Viewing devices +=============== + +The Params section holds configured information. The Current section +holds running statistics. The Result is printed after a run or after +interruption. 
Example:: + + /proc/net/pktgen/eth4@0 + + Params: count 100000 min_pkt_size: 60 max_pkt_size: 60 + frags: 0 delay: 0 clone_skb: 64 ifname: eth4@0 + flows: 0 flowlen: 0 + queue_map_min: 0 queue_map_max: 0 + dst_min: 192.168.81.2 dst_max: + src_min: src_max: + src_mac: 90:e2:ba:0a:56:b4 dst_mac: 00:1b:21:3c:9d:f8 + udp_src_min: 9 udp_src_max: 109 udp_dst_min: 9 udp_dst_max: 9 + src_mac_count: 0 dst_mac_count: 0 + Flags: UDPSRC_RND NO_TIMESTAMP QUEUE_MAP_CPU + Current: + pkts-sofar: 100000 errors: 0 + started: 623913381008us stopped: 623913396439us idle: 25us + seq_num: 100001 cur_dst_mac_offset: 0 cur_src_mac_offset: 0 + cur_saddr: 192.168.8.3 cur_daddr: 192.168.81.2 + cur_udp_dst: 9 cur_udp_src: 42 + cur_queue_map: 0 + flows: 0 + Result: OK: 15430(c15405+d25) usec, 100000 (60byte,0frags) + 6480562pps 3110Mb/sec (3110669760bps) errors: 0 + + +Configuring devices +=================== +This is done via the /proc interface, and most easily done via pgset +as defined in the sample scripts. +You need to specify PGDEV environment variable to use functions from sample +scripts, i.e.:: + + export PGDEV=/proc/net/pktgen/eth4@0 + source samples/pktgen/functions.sh + +Examples:: + + pg_ctrl start starts injection. + pg_ctrl stop aborts injection. Also, ^C aborts generator. + + pgset "clone_skb 1" sets the number of copies of the same packet + pgset "clone_skb 0" use single SKB for all transmits + pgset "burst 8" uses xmit_more API to queue 8 copies of the same + packet and update HW tx queue tail pointer once. + "burst 1" is the default + pgset "pkt_size 9014" sets packet size to 9014 + pgset "frags 5" packet will consist of 5 fragments + pgset "count 200000" sets number of packets to send, set to zero + for continuous sends until explicitly stopped. + + pgset "delay 5000" adds delay to hard_start_xmit(). nanoseconds + + pgset "dst 10.0.0.1" sets IP destination address + (BEWARE! This generator is very aggressive!) + + pgset "dst_min 10.0.0.1" Same as dst + pgset "dst_max 10.0.0.254" Set the maximum destination IP. + pgset "src_min 10.0.0.1" Set the minimum (or only) source IP. + pgset "src_max 10.0.0.254" Set the maximum source IP. + pgset "dst6 fec0::1" IPV6 destination address + pgset "src6 fec0::2" IPV6 source address + pgset "dstmac 00:00:00:00:00:00" sets MAC destination address + pgset "srcmac 00:00:00:00:00:00" sets MAC source address + + pgset "queue_map_min 0" Sets the min value of tx queue interval + pgset "queue_map_max 7" Sets the max value of tx queue interval, for multiqueue devices + To select queue 1 of a given device, + use queue_map_min=1 and queue_map_max=1 + + pgset "src_mac_count 1" Sets the number of MACs we'll range through. + The 'minimum' MAC is what you set with srcmac. + + pgset "dst_mac_count 1" Sets the number of MACs we'll range through. + The 'minimum' MAC is what you set with dstmac. + + pgset "flag [name]" Set a flag to determine behaviour. Current flags + are: IPSRC_RND # IP source is random (between min/max) + IPDST_RND # IP destination is random + UDPSRC_RND, UDPDST_RND, + MACSRC_RND, MACDST_RND + TXSIZE_RND, IPV6, + MPLS_RND, VID_RND, SVID_RND + FLOW_SEQ, + QUEUE_MAP_RND # queue map random + QUEUE_MAP_CPU # queue map mirrors smp_processor_id() + UDPCSUM, + IPSEC # IPsec encapsulation (needs CONFIG_XFRM) + NODE_ALLOC # node specific memory allocation + NO_TIMESTAMP # disable timestamping + pgset 'flag ![name]' Clear a flag to determine behaviour. 
+ Note that you might need to use single quotes in + interactive mode, so that your shell doesn't expand + the specified flag as a history command. + + pgset "spi [SPI_VALUE]" Set specific SA used to transform packet. + + pgset "udp_src_min 9" set UDP source port min; if < udp_src_max, then + cycle through the port range. + + pgset "udp_src_max 9" set UDP source port max. + pgset "udp_dst_min 9" set UDP destination port min; if < udp_dst_max, then + cycle through the port range. + pgset "udp_dst_max 9" set UDP destination port max. + + pgset "mpls 0001000a,0002000a,0000000a" set MPLS labels (in this example + outer label=16,middle label=32, + inner label=0 (IPv4 NULL)) Note that + there must be no spaces between the + arguments. Leading zeros are required. + Do not set the bottom of stack bit, + that's done automatically. If you do + set the bottom of stack bit, that + indicates that you want to randomly + generate that address and the flag + MPLS_RND will be turned on. You + can have any mix of random and fixed + labels in the label stack. + + pgset "mpls 0" turn off mpls (or any invalid argument works too!) + + pgset "vlan_id 77" set VLAN ID 0-4095 + pgset "vlan_p 3" set priority bit 0-7 (default 0) + pgset "vlan_cfi 0" set canonical format identifier 0-1 (default 0) + + pgset "svlan_id 22" set SVLAN ID 0-4095 + pgset "svlan_p 3" set priority bit 0-7 (default 0) + pgset "svlan_cfi 0" set canonical format identifier 0-1 (default 0) + + pgset "vlan_id 9999" > 4095 remove vlan and svlan tags + pgset "svlan 9999" > 4095 remove svlan tag + + + pgset "tos XX" set former IPv4 TOS field (e.g. "tos 28" for AF11 no ECN, default 00) + pgset "traffic_class XX" set former IPv6 TRAFFIC CLASS (e.g. "traffic_class B8" for EF no ECN, default 00) + + pgset "rate 300M" set rate to 300 Mb/s + pgset "ratep 1000000" set rate to 1Mpps + + pgset "xmit_mode netif_receive" RX inject into stack netif_receive_skb() + Works with "burst" but not with "clone_skb". + Default xmit_mode is "start_xmit". + +Sample scripts +============== + +A collection of tutorial scripts and helpers for pktgen is in the +samples/pktgen directory. The helper parameters.sh file supports easy +and consistent parameter parsing across the sample scripts. + +Usage example and help:: + + ./pktgen_sample01_simple.sh -i eth4 -m 00:1B:21:3C:9D:F8 -d 192.168.8.2 + +Usage:: + + ./pktgen_sample01_simple.sh [-vx] -i ethX + + -i : ($DEV) output interface/device (required) + -s : ($PKT_SIZE) packet size + -d : ($DEST_IP) destination IP + -m : ($DST_MAC) destination MAC-addr + -t : ($THREADS) threads to start + -c : ($SKB_CLONE) SKB clones sent before allocating a new SKB + -b : ($BURST) HW level bursting of SKBs + -v : ($VERBOSE) verbose + -x : ($DEBUG) debug + +The global variables being set are also listed. E.g. the required +interface/device parameter "-i" sets variable $DEV. Copy the +pktgen_sampleXX scripts and modify them to fit your own needs. + +The old scripts:: + + pktgen.conf-1-2 # 1 CPU 2 dev + pktgen.conf-1-1-rdos # 1 CPU 1 dev w. route DoS + pktgen.conf-1-1-ip6 # 1 CPU 1 dev ipv6 + pktgen.conf-1-1-ip6-rdos # 1 CPU 1 dev ipv6 w. route DoS + pktgen.conf-1-1-flows # 1 CPU 1 dev multiple flows. + + +Interrupt affinity +=================== +Note that when adding devices to a specific CPU it is a good idea to +also assign /proc/irq/XX/smp_affinity so that the TX interrupts are bound +to the same CPU. This reduces cache bouncing when freeing skbs.
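+
+As a rough illustration (an addition to this document, not code from the
+original scripts), the smp_affinity assignment just described can also be
+done from a small C helper that writes a hexadecimal CPU mask to the
+procfile. The IRQ number 24 below is a made-up placeholder; look up your
+NIC's TX queue IRQ in /proc/interrupts first::
+
+    #include <stdio.h>
+    #include <stdlib.h>
+
+    /* Write a CPU mask to /proc/irq/<irq>/smp_affinity so that the
+     * NIC's TX interrupts fire on the CPU running the pktgen thread.
+     */
+    static int pin_irq_to_cpu(int irq, int cpu)
+    {
+            char path[64];
+            FILE *f;
+
+            snprintf(path, sizeof(path), "/proc/irq/%d/smp_affinity", irq);
+            f = fopen(path, "w");
+            if (!f)
+                    return -1;
+            /* smp_affinity takes a hex bitmask; bit N selects CPU N */
+            fprintf(f, "%x\n", 1 << cpu);
+            return fclose(f);
+    }
+
+    int main(void)
+    {
+            /* e.g. pin hypothetical IRQ 24 to CPU 0, matching kpktgend_0 */
+            return pin_irq_to_cpu(24, 0) ? EXIT_FAILURE : EXIT_SUCCESS;
+    }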
+ +Also set the device flag QUEUE_MAP_CPU, which maps the SKB's TX queue +to the running thread's CPU (directly from smp_processor_id()). + +Enable IPsec +============ +Default IPsec transformation with ESP encapsulation plus transport mode +can be enabled by simply setting:: + + pgset "flag IPSEC" + pgset "flows 1" + +To avoid breaking existing testbed scripts when using AH type and tunnel mode, +you can use "pgset spi SPI_VALUE" to specify which transformation mode +to employ. + + +Current commands and configuration options +========================================== + +**Pgcontrol commands**:: + + start + stop + reset + +**Thread commands**:: + + add_device + rem_device_all + + +**Device commands**:: + + count + clone_skb + burst + debug + + frags + delay + + src_mac_count + dst_mac_count + + pkt_size + min_pkt_size + max_pkt_size + + queue_map_min + queue_map_max + skb_priority + + tos (ipv4) + traffic_class (ipv6) + + mpls + + udp_src_min + udp_src_max + + udp_dst_min + udp_dst_max + + node + + flag + IPSRC_RND + IPDST_RND + UDPSRC_RND + UDPDST_RND + MACSRC_RND + MACDST_RND + TXSIZE_RND + IPV6 + MPLS_RND + VID_RND + SVID_RND + FLOW_SEQ + QUEUE_MAP_RND + QUEUE_MAP_CPU + UDPCSUM + IPSEC + NODE_ALLOC + NO_TIMESTAMP + + spi (ipsec) + + dst_min + dst_max + + src_min + src_max + + dst_mac + src_mac + + clear_counters + + src6 + dst6 + dst6_max + dst6_min + + flows + flowlen + + rate + ratep + + xmit_mode + + vlan_cfi + vlan_id + vlan_p + + svlan_cfi + svlan_id + svlan_p + + +References: + +- ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/ +- ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/examples/ + +Paper from Linux-Kongress in Erlangen 2004. +- ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/pktgen_paper.pdf + +Thanks to: + +Grant Grundler for testing on IA-64 and parisc, Harald Welte, Lennert Buytenhek, +Stephen Hemminger, Andi Kleen, Dave Miller and many others. + + +Good luck with the linux net-development. diff --git a/Documentation/networking/pktgen.txt b/Documentation/networking/pktgen.txt deleted file mode 100644 index d2fd78f85aa4..000000000000 --- a/Documentation/networking/pktgen.txt +++ /dev/null @@ -1,400 +0,0 @@ - - - HOWTO for the linux packet generator - ------------------------------------ - -Enable CONFIG_NET_PKTGEN to compile and build pktgen either in-kernel -or as a module. A module is preferred; modprobe pktgen if needed. Once -running, pktgen creates a thread for each CPU with affinity to that CPU. -Monitoring and controlling is done via /proc. It is easiest to select a -suitable sample script and configure that. - -On a dual CPU: - -ps aux | grep pkt -root 129 0.3 0.0 0 0 ? SW 2003 523:20 [kpktgend_0] -root 130 0.3 0.0 0 0 ? SW 2003 509:50 [kpktgend_1] - - -For monitoring and control pktgen creates: - /proc/net/pktgen/pgctrl - /proc/net/pktgen/kpktgend_X - /proc/net/pktgen/ethX - - -Tuning NIC for max performance -============================== - -The default NIC settings are (likely) not tuned for pktgen's artificial -overload type of benchmarking, as this could hurt the normal use-case. - -Specifically increasing the TX ring buffer in the NIC: - # ethtool -G ethX tx 1024 - -A larger TX ring can improve pktgen's performance, while it can hurt -in the general case, 1) because the TX ring buffer might get larger -than the CPU's L1/L2 cache, 2) because it allows more queueing in the -NIC HW layer (which is bad for bufferbloat). - -One should hesitate to conclude that packets/descriptors in the HW -TX ring cause delay.
Drivers usually delay cleaning up the -ring-buffers for various performance reasons, and packets stalling -the TX ring might just be waiting for cleanup. - -This cleanup issue is specifically the case for the driver ixgbe -(Intel 82599 chip). This driver (ixgbe) combines TX+RX ring cleanups, -and the cleanup interval is affected by the ethtool --coalesce setting -of parameter "rx-usecs". - -For ixgbe use e.g. "30" resulting in approx 33K interrupts/sec (1/30*10^6): - # ethtool -C ethX rx-usecs 30 - - -Kernel threads -============== -Pktgen creates a thread for each CPU with affinity to that CPU. -Which is controlled through procfile /proc/net/pktgen/kpktgend_X. - -Example: /proc/net/pktgen/kpktgend_0 - - Running: - Stopped: eth4@0 - Result: OK: add_device=eth4@0 - -Most important are the devices assigned to the thread. - -The two basic thread commands are: - * add_device DEVICE@NAME -- adds a single device - * rem_device_all -- remove all associated devices - -When adding a device to a thread, a corresponding procfile is created -which is used for configuring this device. Thus, device names need to -be unique. - -To support adding the same device to multiple threads, which is useful -with multi queue NICs, the device naming scheme is extended with "@": - device@something - -The part after "@" can be anything, but it is custom to use the thread -number. - -Viewing devices -=============== - -The Params section holds configured information. The Current section -holds running statistics. The Result is printed after a run or after -interruption. Example: - -/proc/net/pktgen/eth4@0 - - Params: count 100000 min_pkt_size: 60 max_pkt_size: 60 - frags: 0 delay: 0 clone_skb: 64 ifname: eth4@0 - flows: 0 flowlen: 0 - queue_map_min: 0 queue_map_max: 0 - dst_min: 192.168.81.2 dst_max: - src_min: src_max: - src_mac: 90:e2:ba:0a:56:b4 dst_mac: 00:1b:21:3c:9d:f8 - udp_src_min: 9 udp_src_max: 109 udp_dst_min: 9 udp_dst_max: 9 - src_mac_count: 0 dst_mac_count: 0 - Flags: UDPSRC_RND NO_TIMESTAMP QUEUE_MAP_CPU - Current: - pkts-sofar: 100000 errors: 0 - started: 623913381008us stopped: 623913396439us idle: 25us - seq_num: 100001 cur_dst_mac_offset: 0 cur_src_mac_offset: 0 - cur_saddr: 192.168.8.3 cur_daddr: 192.168.81.2 - cur_udp_dst: 9 cur_udp_src: 42 - cur_queue_map: 0 - flows: 0 - Result: OK: 15430(c15405+d25) usec, 100000 (60byte,0frags) - 6480562pps 3110Mb/sec (3110669760bps) errors: 0 - - -Configuring devices -=================== -This is done via the /proc interface, and most easily done via pgset -as defined in the sample scripts. -You need to specify PGDEV environment variable to use functions from sample -scripts, i.e.: -export PGDEV=/proc/net/pktgen/eth4@0 -source samples/pktgen/functions.sh - -Examples: - - pg_ctrl start starts injection. - pg_ctrl stop aborts injection. Also, ^C aborts generator. - - pgset "clone_skb 1" sets the number of copies of the same packet - pgset "clone_skb 0" use single SKB for all transmits - pgset "burst 8" uses xmit_more API to queue 8 copies of the same - packet and update HW tx queue tail pointer once. - "burst 1" is the default - pgset "pkt_size 9014" sets packet size to 9014 - pgset "frags 5" packet will consist of 5 fragments - pgset "count 200000" sets number of packets to send, set to zero - for continuous sends until explicitly stopped. - - pgset "delay 5000" adds delay to hard_start_xmit(). nanoseconds - - pgset "dst 10.0.0.1" sets IP destination address - (BEWARE! This generator is very aggressive!) 
- - pgset "dst_min 10.0.0.1" Same as dst - pgset "dst_max 10.0.0.254" Set the maximum destination IP. - pgset "src_min 10.0.0.1" Set the minimum (or only) source IP. - pgset "src_max 10.0.0.254" Set the maximum source IP. - pgset "dst6 fec0::1" IPV6 destination address - pgset "src6 fec0::2" IPV6 source address - pgset "dstmac 00:00:00:00:00:00" sets MAC destination address - pgset "srcmac 00:00:00:00:00:00" sets MAC source address - - pgset "queue_map_min 0" Sets the min value of tx queue interval - pgset "queue_map_max 7" Sets the max value of tx queue interval, for multiqueue devices - To select queue 1 of a given device, - use queue_map_min=1 and queue_map_max=1 - - pgset "src_mac_count 1" Sets the number of MACs we'll range through. - The 'minimum' MAC is what you set with srcmac. - - pgset "dst_mac_count 1" Sets the number of MACs we'll range through. - The 'minimum' MAC is what you set with dstmac. - - pgset "flag [name]" Set a flag to determine behaviour. Current flags - are: IPSRC_RND # IP source is random (between min/max) - IPDST_RND # IP destination is random - UDPSRC_RND, UDPDST_RND, - MACSRC_RND, MACDST_RND - TXSIZE_RND, IPV6, - MPLS_RND, VID_RND, SVID_RND - FLOW_SEQ, - QUEUE_MAP_RND # queue map random - QUEUE_MAP_CPU # queue map mirrors smp_processor_id() - UDPCSUM, - IPSEC # IPsec encapsulation (needs CONFIG_XFRM) - NODE_ALLOC # node specific memory allocation - NO_TIMESTAMP # disable timestamping - pgset 'flag ![name]' Clear a flag to determine behaviour. - Note that you might need to use single quote in - interactive mode, so that your shell wouldn't expand - the specified flag as a history command. - - pgset "spi [SPI_VALUE]" Set specific SA used to transform packet. - - pgset "udp_src_min 9" set UDP source port min, If < udp_src_max, then - cycle through the port range. - - pgset "udp_src_max 9" set UDP source port max. - pgset "udp_dst_min 9" set UDP destination port min, If < udp_dst_max, then - cycle through the port range. - pgset "udp_dst_max 9" set UDP destination port max. - - pgset "mpls 0001000a,0002000a,0000000a" set MPLS labels (in this example - outer label=16,middle label=32, - inner label=0 (IPv4 NULL)) Note that - there must be no spaces between the - arguments. Leading zeros are required. - Do not set the bottom of stack bit, - that's done automatically. If you do - set the bottom of stack bit, that - indicates that you want to randomly - generate that address and the flag - MPLS_RND will be turned on. You - can have any mix of random and fixed - labels in the label stack. - - pgset "mpls 0" turn off mpls (or any invalid argument works too!) - - pgset "vlan_id 77" set VLAN ID 0-4095 - pgset "vlan_p 3" set priority bit 0-7 (default 0) - pgset "vlan_cfi 0" set canonical format identifier 0-1 (default 0) - - pgset "svlan_id 22" set SVLAN ID 0-4095 - pgset "svlan_p 3" set priority bit 0-7 (default 0) - pgset "svlan_cfi 0" set canonical format identifier 0-1 (default 0) - - pgset "vlan_id 9999" > 4095 remove vlan and svlan tags - pgset "svlan 9999" > 4095 remove svlan tag - - - pgset "tos XX" set former IPv4 TOS field (e.g. "tos 28" for AF11 no ECN, default 00) - pgset "traffic_class XX" set former IPv6 TRAFFIC CLASS (e.g. "traffic_class B8" for EF no ECN, default 00) - - pgset "rate 300M" set rate to 300 Mb/s - pgset "ratep 1000000" set rate to 1Mpps - - pgset "xmit_mode netif_receive" RX inject into stack netif_receive_skb() - Works with "burst" but not with "clone_skb". - Default xmit_mode is "start_xmit". 
- -Sample scripts -============== - -A collection of tutorial scripts and helpers for pktgen is in the -samples/pktgen directory. The helper parameters.sh file support easy -and consistent parameter parsing across the sample scripts. - -Usage example and help: - ./pktgen_sample01_simple.sh -i eth4 -m 00:1B:21:3C:9D:F8 -d 192.168.8.2 - -Usage: ./pktgen_sample01_simple.sh [-vx] -i ethX - -i : ($DEV) output interface/device (required) - -s : ($PKT_SIZE) packet size - -d : ($DEST_IP) destination IP - -m : ($DST_MAC) destination MAC-addr - -t : ($THREADS) threads to start - -c : ($SKB_CLONE) SKB clones send before alloc new SKB - -b : ($BURST) HW level bursting of SKBs - -v : ($VERBOSE) verbose - -x : ($DEBUG) debug - -The global variables being set are also listed. E.g. the required -interface/device parameter "-i" sets variable $DEV. Copy the -pktgen_sampleXX scripts and modify them to fit your own needs. - -The old scripts: - -pktgen.conf-1-2 # 1 CPU 2 dev -pktgen.conf-1-1-rdos # 1 CPU 1 dev w. route DoS -pktgen.conf-1-1-ip6 # 1 CPU 1 dev ipv6 -pktgen.conf-1-1-ip6-rdos # 1 CPU 1 dev ipv6 w. route DoS -pktgen.conf-1-1-flows # 1 CPU 1 dev multiple flows. - - -Interrupt affinity -=================== -Note that when adding devices to a specific CPU it is a good idea to -also assign /proc/irq/XX/smp_affinity so that the TX interrupts are bound -to the same CPU. This reduces cache bouncing when freeing skbs. - -Plus using the device flag QUEUE_MAP_CPU, which maps the SKBs TX queue -to the running threads CPU (directly from smp_processor_id()). - -Enable IPsec -============ -Default IPsec transformation with ESP encapsulation plus transport mode -can be enabled by simply setting: - -pgset "flag IPSEC" -pgset "flows 1" - -To avoid breaking existing testbed scripts for using AH type and tunnel mode, -you can use "pgset spi SPI_VALUE" to specify which transformation mode -to employ. - - -Current commands and configuration options -========================================== - -** Pgcontrol commands: - -start -stop -reset - -** Thread commands: - -add_device -rem_device_all - - -** Device commands: - -count -clone_skb -burst -debug - -frags -delay - -src_mac_count -dst_mac_count - -pkt_size -min_pkt_size -max_pkt_size - -queue_map_min -queue_map_max -skb_priority - -tos (ipv4) -traffic_class (ipv6) - -mpls - -udp_src_min -udp_src_max - -udp_dst_min -udp_dst_max - -node - -flag - IPSRC_RND - IPDST_RND - UDPSRC_RND - UDPDST_RND - MACSRC_RND - MACDST_RND - TXSIZE_RND - IPV6 - MPLS_RND - VID_RND - SVID_RND - FLOW_SEQ - QUEUE_MAP_RND - QUEUE_MAP_CPU - UDPCSUM - IPSEC - NODE_ALLOC - NO_TIMESTAMP - -spi (ipsec) - -dst_min -dst_max - -src_min -src_max - -dst_mac -src_mac - -clear_counters - -src6 -dst6 -dst6_max -dst6_min - -flows -flowlen - -rate -ratep - -xmit_mode - -vlan_cfi -vlan_id -vlan_p - -svlan_cfi -svlan_id -svlan_p - - -References: -ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/ -ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/examples/ - -Paper from Linux-Kongress in Erlangen 2004. -ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/pktgen_paper.pdf - -Thanks to: -Grant Grundler for testing on IA-64 and parisc, Harald Welte, Lennert Buytenhek -Stephen Hemminger, Andi Kleen, Dave Miller and many others. - - -Good luck with the linux net-development. diff --git a/net/Kconfig b/net/Kconfig index 8b1f85820a6b..c5ba2d180c43 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -344,7 +344,7 @@ config NET_PKTGEN what was just said, you don't need it: say N. 
Documentation on how to use the packet generator can be found - at <file:Documentation/networking/pktgen.txt>. + at <file:Documentation/networking/pktgen.rst>. To compile this code as a module, choose M here: the module will be called pktgen. diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 08e2811b5274..b53b6d38c4df 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -56,7 +56,7 @@ * Integrated to 2.5.x 021029 --Lucio Maciel (luciomaciel@zipmail.com.br) * * 021124 Finished major redesign and rewrite for new functionality. - * See Documentation/networking/pktgen.txt for how to use this. + * See Documentation/networking/pktgen.rst for how to use this. * * The new operation: * For each CPU one thread/process is created at start. This process checks diff --git a/samples/pktgen/README.rst b/samples/pktgen/README.rst index 3f6483e8b2df..f9c53ca5cf93 100644 --- a/samples/pktgen/README.rst +++ b/samples/pktgen/README.rst @@ -3,7 +3,7 @@ Sample and benchmark scripts for pktgen (packet generator) This directory contains some pktgen sample and benchmark scripts, that can easily be copied and adjusted for your own use-case. -General doc is located in kernel: Documentation/networking/pktgen.txt +General doc is located in kernel: Documentation/networking/pktgen.rst Helper include files ==================== -- cgit v1.2.3 From 66d495d0a5aecd1692f6b5e3190de14f9a31e14b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:17 +0200 Subject: docs: networking: convert radiotap-headers.txt to ReST - add SPDX header; - adjust title markup; - mark code blocks and literals as such; - adjust indentation, whitespace and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/mac80211-injection.rst | 2 +- Documentation/networking/radiotap-headers.rst | 159 ++++++++++++++++++++++++ Documentation/networking/radiotap-headers.txt | 152 ---------------------- include/net/cfg80211.h | 2 +- net/wireless/radiotap.c | 2 +- 6 files changed, 163 insertions(+), 155 deletions(-) create mode 100644 Documentation/networking/radiotap-headers.rst delete mode 100644 Documentation/networking/radiotap-headers.txt (limited to 'net') diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 0da7eb0ec85a..85bc52d0b3a6 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -95,6 +95,7 @@ Contents: plip ppp_generic proc_net_tcp + radiotap-headers .. only:: subproject and html diff --git a/Documentation/networking/mac80211-injection.rst b/Documentation/networking/mac80211-injection.rst index 75d4edcae852..be65f886ff1f 100644 --- a/Documentation/networking/mac80211-injection.rst +++ b/Documentation/networking/mac80211-injection.rst @@ -13,7 +13,7 @@ following format:: [ payload ] The radiotap format is discussed in -./Documentation/networking/radiotap-headers.txt. +./Documentation/networking/radiotap-headers.rst. Despite many radiotap parameters being currently defined, most only make sense to appear on received packets. The following information is parsed from the diff --git a/Documentation/networking/radiotap-headers.rst b/Documentation/networking/radiotap-headers.rst new file mode 100644 index 000000000000..1a1bd1ec0650 --- /dev/null +++ b/Documentation/networking/radiotap-headers.rst @@ -0,0 +1,159 @@ +..
SPDX-License-Identifier: GPL-2.0 + +=========================== +How to use radiotap headers +=========================== + +Pointer to the radiotap include file +------------------------------------ + +Radiotap headers are variable-length and extensible; you can get most of the +information you need to know about them from:: + + ./include/net/ieee80211_radiotap.h + +This document gives an overview and warns about some corner cases. + + +Structure of the header +----------------------- + +There is a fixed portion at the start which contains a u32 bitmap that defines +whether the possible argument associated with that bit is present or not. So if +b0 of the it_present member of ieee80211_radiotap_header is set, it means that +the header for argument index 0 (IEEE80211_RADIOTAP_TSFT) is present in the +argument area. + +:: + + < 8-byte ieee80211_radiotap_header > + [ <possible argument bitmap extensions ... > ] + [ <argument> ... ] + +At the moment there are only 13 possible argument indexes defined, but in case +we run out of space in the u32 it_present member, it is defined that b31 set +indicates that there is another u32 bitmap following (shown as "possible +argument bitmap extensions..." above), and the start of the arguments is moved +forward 4 bytes each time. + +Note also that the __le16 it_len member is set to the total number of bytes +covered by the ieee80211_radiotap_header and any arguments following. + + +Requirements for arguments +-------------------------- + +After the fixed part of the header, the arguments follow for each argument +index whose matching bit is set in the it_present member of +ieee80211_radiotap_header. + + - the arguments are all stored little-endian! + + - the argument payload for a given argument index has a fixed size. So + IEEE80211_RADIOTAP_TSFT being present always indicates an 8-byte argument is + present. See the comments in ./include/net/ieee80211_radiotap.h for a nice + breakdown of all the argument sizes + + - the arguments must be aligned to a boundary of the argument size using + padding. So a u16 argument must start on the next u16 boundary if it isn't + already on one, a u32 must start on the next u32 boundary and so on. + + - "alignment" is relative to the start of the ieee80211_radiotap_header, ie, + the first byte of the radiotap header. The absolute alignment of that first + byte isn't defined. So even if the whole radiotap header is starting at, eg, + address 0x00000003, still the first byte of the radiotap header is treated as + 0 for alignment purposes. + + - the above point that there may be no absolute alignment for multibyte + entities in the fixed radiotap header or the argument region means that you + have to take special evasive action when trying to access these multibyte + entities. Some arches like Blackfin cannot deal with an attempt to + dereference, eg, a u16 pointer that is pointing to an odd address. Instead + you have to use a kernel API get_unaligned() to dereference the pointer, + which will do it bytewise on the arches that require that. + + - The arguments for a given argument index can be a compound of multiple types + together. For example IEEE80211_RADIOTAP_CHANNEL has an argument payload + consisting of two u16s of total length 4. When this happens, the padding + rule is applied dealing with a u16, NOT dealing with a 4-byte single entity.
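+
+To make the alignment rule concrete, here is a short C sketch (an
+illustration added for this document, not code from the kernel tree) of how
+a parser advances its offset to the natural alignment of the next argument
+and reads a little-endian u16 without assuming the pointer is aligned - a
+userspace stand-in for the kernel's get_unaligned()::
+
+    #include <stdint.h>
+    #include <stdio.h>
+
+    /* Round offset up to the natural alignment of the next argument.
+     * The offset is relative to the first byte of the radiotap header,
+     * per the alignment rule above.
+     */
+    static size_t rt_align(size_t offset, size_t align)
+    {
+            return (offset + align - 1) & ~(align - 1);
+    }
+
+    /* Read a u16 argument byte-wise: safe on any arch, any alignment,
+     * and forces the little-endian interpretation the format requires.
+     */
+    static uint16_t rt_get_le16(const uint8_t *p)
+    {
+            return (uint16_t)(p[0] | (p[1] << 8));
+    }
+
+    int main(void)
+    {
+            /* a u16 argument following a 1-byte argument ending at
+             * offset 9 must start at offset 10, the next u16 boundary
+             */
+            uint8_t raw[2] = { 0x6c, 0x09 };    /* little-endian 2412 */
+
+            printf("%zu\n", rt_align(9, sizeof(uint16_t)));  /* 10 */
+            printf("%u\n", rt_get_le16(raw));                /* 2412 */
+            return 0;
+    }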
+ + +Example valid radiotap header +----------------------------- + +:: + + 0x00, 0x00, // <-- radiotap version + pad byte + 0x0b, 0x00, // <-- radiotap header length + 0x04, 0x0c, 0x00, 0x00, // <-- bitmap + 0x6c, // <-- rate (in 500kbps units) + 0x0c, // <-- tx power + 0x01 // <-- antenna + + +Using the Radiotap Parser +------------------------- + +If you have to parse a radiotap struct, you can radically simplify the +job by using the radiotap parser that lives in net/wireless/radiotap.c and has +its prototypes available in include/net/cfg80211.h. You use it like this:: + + #include <net/cfg80211.h> + + /* buf points to the start of the radiotap header part */ + + int MyFunction(u8 * buf, int buflen) + { + int pkt_rate_100kHz = 0, antenna = 0, pwr = 0; + struct ieee80211_radiotap_iterator iterator; + int ret = ieee80211_radiotap_iterator_init(&iterator, buf, buflen); + + while (!ret) { + + ret = ieee80211_radiotap_iterator_next(&iterator); + + if (ret) + continue; + + /* see if this argument is something we can use */ + + switch (iterator.this_arg_index) { + /* + * You must take care when dereferencing iterator.this_arg + * for multibyte types... the pointer is not aligned. Use + * get_unaligned((type *)iterator.this_arg) to dereference + * iterator.this_arg for type "type" safely on all arches. + */ + case IEEE80211_RADIOTAP_RATE: + /* radiotap "rate" u8 is in + * 500kbps units, eg, 0x02=1Mbps + */ + pkt_rate_100kHz = (*iterator.this_arg) * 5; + break; + + case IEEE80211_RADIOTAP_ANTENNA: + /* radiotap uses 0 for 1st ant */ + antenna = *iterator.this_arg; + break; + + case IEEE80211_RADIOTAP_DBM_TX_POWER: + pwr = *iterator.this_arg; + break; + + default: + break; + } + } /* while more rt headers */ + + if (ret != -ENOENT) + return TXRX_DROP; + + /* discard the radiotap header part */ + buf += iterator.max_length; + buflen -= iterator.max_length; + + ... + + } + +Andy Green diff --git a/Documentation/networking/radiotap-headers.txt b/Documentation/networking/radiotap-headers.txt deleted file mode 100644 index 953331c7984f..000000000000 --- a/Documentation/networking/radiotap-headers.txt +++ /dev/null @@ -1,152 +0,0 @@ -How to use radiotap headers -=========================== - -Pointer to the radiotap include file ------------------------------------- - -Radiotap headers are variable-length and extensible, you can get most of the -information you need to know on them from: - -./include/net/ieee80211_radiotap.h - -This document gives an overview and warns on some corner cases. - - -Structure of the header ------------------------ - -There is a fixed portion at the start which contains a u32 bitmap that defines -if the possible argument associated with that bit is present or not. So if b0 -of the it_present member of ieee80211_radiotap_header is set, it means that -the header for argument index 0 (IEEE80211_RADIOTAP_TSFT) is present in the -argument area. - - < 8-byte ieee80211_radiotap_header > - [ ] - [ ... ] - -At the moment there are only 13 possible argument indexes defined, but in case -we run out of space in the u32 it_present member, it is defined that b31 set -indicates that there is another u32 bitmap following (shown as "possible -argument bitmap extensions..." above), and the start of the arguments is moved -forward 4 bytes each time. - -Note also that the it_len member __le16 is set to the total number of bytes -covered by the ieee80211_radiotap_header and any arguments following.
- - -Requirements for arguments --------------------------- - -After the fixed part of the header, the arguments follow for each argument -index whose matching bit is set in the it_present member of -ieee80211_radiotap_header. - - - the arguments are all stored little-endian! - - - the argument payload for a given argument index has a fixed size. So - IEEE80211_RADIOTAP_TSFT being present always indicates an 8-byte argument is - present. See the comments in ./include/net/ieee80211_radiotap.h for a nice - breakdown of all the argument sizes - - - the arguments must be aligned to a boundary of the argument size using - padding. So a u16 argument must start on the next u16 boundary if it isn't - already on one, a u32 must start on the next u32 boundary and so on. - - - "alignment" is relative to the start of the ieee80211_radiotap_header, ie, - the first byte of the radiotap header. The absolute alignment of that first - byte isn't defined. So even if the whole radiotap header is starting at, eg, - address 0x00000003, still the first byte of the radiotap header is treated as - 0 for alignment purposes. - - - the above point that there may be no absolute alignment for multibyte - entities in the fixed radiotap header or the argument region means that you - have to take special evasive action when trying to access these multibyte - entities. Some arches like Blackfin cannot deal with an attempt to - dereference, eg, a u16 pointer that is pointing to an odd address. Instead - you have to use a kernel API get_unaligned() to dereference the pointer, - which will do it bytewise on the arches that require that. - - - The arguments for a given argument index can be a compound of multiple types - together. For example IEEE80211_RADIOTAP_CHANNEL has an argument payload - consisting of two u16s of total length 4. When this happens, the padding - rule is applied dealing with a u16, NOT dealing with a 4-byte single entity. - - -Example valid radiotap header ------------------------------ - - 0x00, 0x00, // <-- radiotap version + pad byte - 0x0b, 0x00, // <- radiotap header length - 0x04, 0x0c, 0x00, 0x00, // <-- bitmap - 0x6c, // <-- rate (in 500kHz units) - 0x0c, //<-- tx power - 0x01 //<-- antenna - - -Using the Radiotap Parser -------------------------- - -If you are having to parse a radiotap struct, you can radically simplify the -job by using the radiotap parser that lives in net/wireless/radiotap.c and has -its prototypes available in include/net/cfg80211.h. You use it like this: - -#include - -/* buf points to the start of the radiotap header part */ - -int MyFunction(u8 * buf, int buflen) -{ - int pkt_rate_100kHz = 0, antenna = 0, pwr = 0; - struct ieee80211_radiotap_iterator iterator; - int ret = ieee80211_radiotap_iterator_init(&iterator, buf, buflen); - - while (!ret) { - - ret = ieee80211_radiotap_iterator_next(&iterator); - - if (ret) - continue; - - /* see if this argument is something we can use */ - - switch (iterator.this_arg_index) { - /* - * You must take care when dereferencing iterator.this_arg - * for multibyte types... the pointer is not aligned. Use - * get_unaligned((type *)iterator.this_arg) to dereference - * iterator.this_arg for type "type" safely on all arches. 
- */ - case IEEE80211_RADIOTAP_RATE: - /* radiotap "rate" u8 is in - * 500kbps units, eg, 0x02=1Mbps - */ - pkt_rate_100kHz = (*iterator.this_arg) * 5; - break; - - case IEEE80211_RADIOTAP_ANTENNA: - /* radiotap uses 0 for 1st ant */ - antenna = *iterator.this_arg); - break; - - case IEEE80211_RADIOTAP_DBM_TX_POWER: - pwr = *iterator.this_arg; - break; - - default: - break; - } - } /* while more rt headers */ - - if (ret != -ENOENT) - return TXRX_DROP; - - /* discard the radiotap header part */ - buf += iterator.max_length; - buflen -= iterator.max_length; - - ... - -} - -Andy Green diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 70e48f66dac8..46ac80423b28 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5211,7 +5211,7 @@ u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband, * Radiotap parsing functions -- for controlled injection support * * Implemented in net/wireless/radiotap.c - * Documentation in Documentation/networking/radiotap-headers.txt + * Documentation in Documentation/networking/radiotap-headers.rst */ struct radiotap_align_size { diff --git a/net/wireless/radiotap.c b/net/wireless/radiotap.c index 6582d155e2fc..d5e28239e030 100644 --- a/net/wireless/radiotap.c +++ b/net/wireless/radiotap.c @@ -90,7 +90,7 @@ static const struct ieee80211_radiotap_namespace radiotap_ns = { * iterator.this_arg for type "type" safely on all arches. * * Example code: - * See Documentation/networking/radiotap-headers.txt + * See Documentation/networking/radiotap-headers.rst */ int ieee80211_radiotap_iterator_init( -- cgit v1.2.3 From 9f72374cb5959556870be8078b128158edde5d3e Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:21 +0200 Subject: docs: networking: convert rxrpc.txt to ReST - add SPDX header; - adjust title markup; - use autonumbered list markup; - mark code blocks and literals as such; - mark tables as such; - adjust indentation, whitespace and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/filesystems/afs.rst | 2 +- Documentation/networking/index.rst | 1 + Documentation/networking/rxrpc.rst | 1169 ++++++++++++++++++++++++++++++++++++ Documentation/networking/rxrpc.txt | 1155 ----------------------------------- MAINTAINERS | 2 +- net/rxrpc/Kconfig | 6 +- net/rxrpc/sysctl.c | 2 +- 7 files changed, 1176 insertions(+), 1161 deletions(-) create mode 100644 Documentation/networking/rxrpc.rst delete mode 100644 Documentation/networking/rxrpc.txt (limited to 'net') diff --git a/Documentation/filesystems/afs.rst b/Documentation/filesystems/afs.rst index c4ec39a5966e..cada9464d6bd 100644 --- a/Documentation/filesystems/afs.rst +++ b/Documentation/filesystems/afs.rst @@ -70,7 +70,7 @@ list of volume location server IP addresses:: The first module is the AF_RXRPC network protocol driver. This provides the RxRPC remote operation protocol and may also be accessed from userspace. See: - Documentation/networking/rxrpc.txt + Documentation/networking/rxrpc.rst The second module is the kerberos RxRPC security driver, and the third module is the actual filesystem driver for the AFS filesystem. diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index bc3b04a2edde..cd307b9601fa 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -99,6 +99,7 @@ Contents: ray_cs rds regulatory + rxrpc + +..
only:: subproject and html diff --git a/Documentation/networking/rxrpc.rst b/Documentation/networking/rxrpc.rst new file mode 100644 index 000000000000..5ad35113d0f4 --- /dev/null +++ b/Documentation/networking/rxrpc.rst @@ -0,0 +1,1169 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================== +RxRPC Network Protocol +====================== + +The RxRPC protocol driver provides a reliable two-phase transport on top of UDP +that can be used to perform RxRPC remote operations. This is done over sockets +of AF_RXRPC family, using sendmsg() and recvmsg() with control data to send and +receive data, aborts and errors. + +Contents of this document: + + (#) Overview. + + (#) RxRPC protocol summary. + + (#) AF_RXRPC driver model. + + (#) Control messages. + + (#) Socket options. + + (#) Security. + + (#) Example client usage. + + (#) Example server usage. + + (#) AF_RXRPC kernel interface. + + (#) Configurable parameters. + + +Overview +======== + +RxRPC is a two-layer protocol. There is a session layer which provides +reliable virtual connections using UDP over IPv4 (or IPv6) as the transport +layer, but implements a real network protocol; and there's the presentation +layer which renders structured data to binary blobs and back again using XDR +(as does SunRPC):: + + +-------------+ + | Application | + +-------------+ + | XDR | Presentation + +-------------+ + | RxRPC | Session + +-------------+ + | UDP | Transport + +-------------+ + + +AF_RXRPC provides: + + (1) Part of an RxRPC facility for both kernel and userspace applications by + making the session part of it a Linux network protocol (AF_RXRPC). + + (2) A two-phase protocol. The client transmits a blob (the request) and then + receives a blob (the reply), and the server receives the request and then + transmits the reply. + + (3) Retention of the reusable bits of the transport system set up for one call + to speed up subsequent calls. + + (4) A secure protocol, using the Linux kernel's key retention facility to + manage security on the client end. The server end must of necessity be + more active in security negotiations. + +AF_RXRPC does not provide XDR marshalling/presentation facilities. That is +left to the application. AF_RXRPC only deals in blobs. Even the operation ID +is just the first four bytes of the request blob, and as such is beyond the +kernel's interest. + + +Sockets of AF_RXRPC family are: + + (1) created as type SOCK_DGRAM; + + (2) provided with a protocol of the type of underlying transport they're going + to use - currently only PF_INET is supported. + + +The Andrew File System (AFS) is an example of an application that uses this and +that has both kernel (filesystem) and userspace (utility) components. + + +RxRPC Protocol Summary +====================== + +An overview of the RxRPC protocol: + + (#) RxRPC sits on top of another networking protocol (UDP is the only option + currently), and uses this to provide network transport. UDP ports, for + example, provide transport endpoints. + + (#) RxRPC supports multiple virtual "connections" from any given transport + endpoint, thus allowing the endpoints to be shared, even to the same + remote endpoint. + + (#) Each connection goes to a particular "service". A connection may not go + to multiple services. A service may be considered the RxRPC equivalent of + a port number. AF_RXRPC permits multiple services to share an endpoint. 
+ + (#) Client-originating packets are marked, thus a transport endpoint can be + shared between client and server connections (connections have a + direction). + + (#) Up to a billion connections may be supported concurrently between one + local transport endpoint and one service on one remote endpoint. An RxRPC + connection is described by seven numbers:: + + Local address } + Local port } Transport (UDP) address + Remote address } + Remote port } + Direction + Connection ID + Service ID + + (#) Each RxRPC operation is a "call". A connection may make up to four + billion calls, but only up to four calls may be in progress on a + connection at any one time. + + (#) Calls are two-phase and asymmetric: the client sends its request data, + which the service receives; then the service transmits the reply data + which the client receives. + + (#) The data blobs are of indefinite size; the end of a phase is marked with a + flag in the packet. The number of packets of data making up one blob may + not exceed 4 billion, however, as this would cause the sequence number to + wrap. + + (#) The first four bytes of the request data are the service operation ID. + + (#) Security is negotiated on a per-connection basis. The connection is + initiated by the first data packet on it arriving. If security is + requested, the server then issues a "challenge" and then the client + replies with a "response". If the response is successful, the security is + set for the lifetime of that connection, and all subsequent calls made + upon it use that same security. In the event that the server lets a + connection lapse before the client, the security will be renegotiated if + the client uses the connection again. + + (#) Calls use ACK packets to handle reliability. Data packets are also + explicitly sequenced per call. + + (#) There are two types of positive acknowledgment: hard-ACKs and soft-ACKs. + A hard-ACK indicates to the far side that all the data received to a point + has been received and processed; a soft-ACK indicates that the data has + been received but may yet be discarded and re-requested. The sender may + not discard any transmittable packets until they've been hard-ACK'd. + + (#) Reception of a reply data packet implicitly hard-ACK's all the data + packets that make up the request. + + (#) A call is complete when the request has been sent, the reply has been + received and the final hard-ACK on the last packet of the reply has + reached the server. + + (#) A call may be aborted by either end at any time up to its completion. + + +AF_RXRPC Driver Model +===================== + +About the AF_RXRPC driver: + + (#) The AF_RXRPC protocol transparently uses internal sockets of the transport + protocol to represent transport endpoints. + + (#) AF_RXRPC sockets map onto RxRPC connection bundles. Actual RxRPC + connections are handled transparently. One client socket may be used to + make multiple simultaneous calls to the same service. One server socket + may handle calls from many clients. + + (#) Additional parallel client connections will be initiated to support extra + concurrent calls, up to a tunable limit. + + (#) Each connection is retained for a certain amount of time [tunable] after + the last call currently using it has completed in case a new call is made + that could reuse it. + + (#) Each internal UDP socket is retained [tunable] for a certain amount of + time [tunable] after the last connection using it is discarded, in case a new + connection is made that could use it.
+ + (#) A client-side connection is only shared between calls if they have + the same key struct describing their security (and assuming the calls + would otherwise share the connection). Non-secured calls would also be + able to share connections with each other. + + (#) A server-side connection is shared if the client says it is. + + (#) ACK'ing is handled by the protocol driver automatically, including + replying to pings. + + (#) SO_KEEPALIVE automatically pings the other side to keep the connection + alive [TODO]. + + (#) If an ICMP error is received, all calls affected by that error will be + aborted with an appropriate network error passed through recvmsg(). + + +Interaction with the user of the RxRPC socket: + + (#) A socket is made into a server socket by binding an address with a + non-zero service ID. + + (#) In the client, sending a request is achieved with one or more sendmsgs, + followed by the reply being received with one or more recvmsgs. + + (#) The first sendmsg for a request to be sent from a client contains a tag to + be used in all other sendmsgs or recvmsgs associated with that call. The + tag is carried in the control data. + + (#) connect() is used to supply a default destination address for a client + socket. This may be overridden by supplying an alternate address to the + first sendmsg() of a call (struct msghdr::msg_name). + + (#) If connect() is called on an unbound client, a random local port will be + bound before the operation takes place. + + (#) A server socket may also be used to make client calls. To do this, the + first sendmsg() of the call must specify the target address. The server's + transport endpoint is used to send the packets. + + (#) Once the application has received the last message associated with a call, + the tag is guaranteed not to be seen again, and so it can be used to pin + client resources. A new call can then be initiated with the same tag + without fear of interference. + + (#) In the server, a request is received with one or more recvmsgs, then + the reply is transmitted with one or more sendmsgs, and then the final ACK + is received with a last recvmsg. + + (#) When sending data for a call, sendmsg is given MSG_MORE if there's more + data to come on that call. + + (#) When receiving data for a call, recvmsg flags MSG_MORE if there's more + data to come for that call. + + (#) When receiving data or messages for a call, MSG_EOR is flagged by recvmsg + to indicate the terminal message for that call. + + (#) A call may be aborted by adding an abort control message to the control + data. Issuing an abort terminates the kernel's use of that call's tag. + Any messages waiting in the receive queue for that call will be discarded. + + (#) Aborts, busy notifications and challenge packets are delivered by recvmsg, + and control data messages will be set to indicate the context. Receiving + an abort or a busy message terminates the kernel's use of that call's tag. + + (#) The control data part of the msghdr struct is used for a number of things: + + (#) The tag of the intended or affected call. + + (#) Sending or receiving errors, aborts and busy notifications. + + (#) Notifications of incoming calls. + + (#) Sending debug requests and receiving debug replies [TODO]. + + (#) When the kernel has received and set up an incoming call, it sends a + message to the server application to let it know there's a new call awaiting + its acceptance [recvmsg reports a special control message].
The server + application then uses sendmsg to assign a tag to the new call. Once that + is done, the first part of the request data will be delivered by recvmsg. + + (#) The server application has to provide the server socket with a keyring of + secret keys corresponding to the security types it permits. When a secure + connection is being set up, the kernel looks up the appropriate secret key + in the keyring and then sends a challenge packet to the client and + receives a response packet. The kernel then checks the authorisation of + the packet and either aborts the connection or sets up the security. + + (#) The name of the key a client will use to secure its communications is + nominated by a socket option. + + +Notes on sendmsg: + + (#) MSG_WAITALL can be set to tell sendmsg to ignore signals if the peer is + making progress at accepting packets within a reasonable time such that we + manage to queue up all the data for transmission. This requires the + client to accept at least one packet per 2*RTT time period. + + If this isn't set, sendmsg() will return immediately, either returning + EINTR/ERESTARTSYS if nothing was consumed or returning the amount of data + consumed. + + +Notes on recvmsg: + + (#) If there's a sequence of data messages belonging to a particular call on + the receive queue, then recvmsg will keep working through them until: + + (a) it meets the end of that call's received data, + + (b) it meets a non-data message, + + (c) it meets a message belonging to a different call, or + + (d) it fills the user buffer. + + If recvmsg is called in blocking mode, it will keep sleeping, awaiting the + reception of further data, until one of the above four conditions is met. + + (#) MSG_PEEK operates similarly, but will return immediately if it has put any + data in the buffer rather than sleeping until it can fill the buffer. + + (#) If a data message is only partially consumed in filling a user buffer, + then the remainder of that message will be left on the front of the queue + for the next taker. MSG_TRUNC will never be flagged. + + (#) If there is more data to be had on a call (it hasn't copied the last byte + of the last data message in that phase yet), then MSG_MORE will be + flagged. + + +Control Messages +================ + +AF_RXRPC makes use of control messages in sendmsg() and recvmsg() to multiplex +calls, to invoke certain actions and to report certain conditions. These are: + + ======================= === =========== =============================== + MESSAGE ID SRT DATA MEANING + ======================= === =========== =============================== + RXRPC_USER_CALL_ID sr- User ID App's call specifier + RXRPC_ABORT srt Abort code Abort code to issue/received + RXRPC_ACK -rt n/a Final ACK received + RXRPC_NET_ERROR -rt error num Network error on call + RXRPC_BUSY -rt n/a Call rejected (server busy) + RXRPC_LOCAL_ERROR -rt error num Local error encountered + RXRPC_NEW_CALL -r- n/a New call received + RXRPC_ACCEPT s-- n/a Accept new call + RXRPC_EXCLUSIVE_CALL s-- n/a Make an exclusive client call + RXRPC_UPGRADE_SERVICE s-- n/a Client call can be upgraded + RXRPC_TX_LENGTH s-- data len Total length of Tx data + ======================= === =========== =============================== + + (SRT = usable in Sendmsg / delivered by Recvmsg / Terminal message) + + (#) RXRPC_USER_CALL_ID + + This is used to indicate the application's call ID.
It's an unsigned long + that the app specifies in the client by attaching it to the first data + message or in the server by passing it in association with an RXRPC_ACCEPT + message. recvmsg() passes it in conjunction with all messages except + those of the RXRPC_NEW_CALL message. + + (#) RXRPC_ABORT + + This can be used by an application to abort a call by passing it to + sendmsg, or it can be delivered by recvmsg to indicate a remote abort was + received. Either way, it must be associated with an RXRPC_USER_CALL_ID to + specify the call affected. If an abort is being sent, then error EBADSLT + will be returned if there is no call with that user ID. + + (#) RXRPC_ACK + + This is delivered to a server application to indicate that the final ACK + of a call was received from the client. It will be associated with an + RXRPC_USER_CALL_ID to indicate the call that's now complete. + + (#) RXRPC_NET_ERROR + + This is delivered to an application to indicate that an ICMP error message + was encountered in the process of trying to talk to the peer. An + errno-class integer value will be included in the control message data + indicating the problem, and an RXRPC_USER_CALL_ID will indicate the call + affected. + + (#) RXRPC_BUSY + + This is delivered to a client application to indicate that a call was + rejected by the server due to the server being busy. It will be + associated with an RXRPC_USER_CALL_ID to indicate the rejected call. + + (#) RXRPC_LOCAL_ERROR + + This is delivered to an application to indicate that a local error was + encountered and that a call has been aborted because of it. An + errno-class integer value will be included in the control message data + indicating the problem, and an RXRPC_USER_CALL_ID will indicate the call + affected. + + (#) RXRPC_NEW_CALL + + This is delivered to indicate to a server application that a new call has + arrived and is awaiting acceptance. No user ID is associated with this, + as a user ID must subsequently be assigned by doing an RXRPC_ACCEPT. + + (#) RXRPC_ACCEPT + + This is used by a server application to attempt to accept a call and + assign it a user ID. It should be associated with an RXRPC_USER_CALL_ID + to indicate the user ID to be assigned. If there is no call to be + accepted (it may have timed out, been aborted, etc.), then sendmsg will + return error ENODATA. If the user ID is already in use by another call, + then error EBADSLT will be returned. + + (#) RXRPC_EXCLUSIVE_CALL + + This is used to indicate that a client call should be made on a one-off + connection. The connection is discarded once the call has terminated. + + (#) RXRPC_UPGRADE_SERVICE + + This is used to make a client call to probe if the specified service ID + may be upgraded by the server. The caller must check msg_name returned to + recvmsg() for the service ID actually in use. The operation probed must + be one that takes the same arguments in both services. + + Once this has been used to establish the upgrade capability (or lack + thereof) of the server, the service ID returned should be used for all + future communication to that server and RXRPC_UPGRADE_SERVICE should no + longer be set. + + (#) RXRPC_TX_LENGTH + + This is used to inform the kernel of the total amount of data that is + going to be transmitted by a call (whether in a client request or a + service response). If given, it allows the kernel to encrypt from the + userspace buffer directly to the packet buffers, rather than copying into + the buffer and then encrypting in place.
This may only be given with the + first sendmsg() providing data for a call. EMSGSIZE will be generated if + the amount of data actually given is different. + + This takes a parameter of __s64 type that indicates how much will be + transmitted. This may not be less than zero. + +The symbol RXRPC__SUPPORTED is defined as one more than the highest control +message type supported. At run time this can be queried by means of the +RXRPC_SUPPORTED_CMSG socket option (see below). + + +Socket Options +============== + +AF_RXRPC sockets support a few socket options at the SOL_RXRPC level: + + (#) RXRPC_SECURITY_KEY + + This is used to specify the description of the key to be used. The key is + extracted from the calling process's keyrings with request_key() and + should be of "rxrpc" type. + + The optval pointer points to the description string, and optlen indicates + how long the string is, without the NUL terminator. + + (#) RXRPC_SECURITY_KEYRING + + Similar to above but specifies a keyring of server secret keys to use (key + type "keyring"). See the "Security" section. + + (#) RXRPC_EXCLUSIVE_CONNECTION + + This is used to request that new connections should be used for each call + made subsequently on this socket. optval should be NULL and optlen 0. + + (#) RXRPC_MIN_SECURITY_LEVEL + + This is used to specify the minimum security level required for calls on + this socket. optval must point to an int containing one of the following + values: + + (a) RXRPC_SECURITY_PLAIN + + Encrypted checksum only. + + (b) RXRPC_SECURITY_AUTH + + Encrypted checksum plus packet padded and first eight bytes of packet + encrypted - which includes the actual packet length. + + (c) RXRPC_SECURITY_ENCRYPTED + + Encrypted checksum plus entire packet padded and encrypted, including + actual packet length. + + (#) RXRPC_UPGRADEABLE_SERVICE + + This is used to indicate that a service socket with two bindings may + upgrade one bound service to the other if requested by the client. optval + must point to an array of two unsigned short ints. The first is the + service ID to upgrade from and the second the service ID to upgrade to. + + (#) RXRPC_SUPPORTED_CMSG + + This is a read-only option that writes an int into the buffer indicating + the highest control message type supported. + + +Security +======== + +Currently, only the Kerberos 4 equivalent protocol has been implemented +(security index 2 - rxkad). This requires the rxkad module to be loaded and, +on the client, tickets of the appropriate type to be obtained from the AFS +kaserver or the Kerberos server and installed as "rxrpc" type keys. This is +normally done using the klog program. A simple example klog program can be +found at: + + http://people.redhat.com/~dhowells/rxrpc/klog.c + +The payload provided to add_key() on the client should be of the following +form:: + + struct rxrpc_key_sec2_v1 { + uint16_t security_index; /* 2 */ + uint16_t ticket_length; /* length of ticket[] */ + uint32_t expiry; /* time at which expires */ + uint8_t kvno; /* key version number */ + uint8_t __pad[3]; + uint8_t session_key[8]; /* DES session key */ + uint8_t ticket[0]; /* the encrypted ticket */ + }; + +Where the ticket blob is just appended to the above structure. + + +For the server, keys of type "rxrpc_s" must be made available to the server. +They have a description of "<serviceID>:<securityIndex>" (eg: "52:2" for an +rxkad key for the AFS VL service). When such a key is created, it should be +given the server's secret key as the instantiation data (see the example +below)::
+ + add_key("rxrpc_s", "52:2", secret_key, 8, keyring); + +A keyring is passed to the server socket by naming it in a sockopt. The server +socket then looks the server secret keys up in this keyring when secure +incoming connections are made. This can be seen in an example program that can +be found at: + + http://people.redhat.com/~dhowells/rxrpc/listen.c + + +Example Client Usage +==================== + +A client would issue an operation by: + + (1) An RxRPC socket is set up by:: + + client = socket(AF_RXRPC, SOCK_DGRAM, PF_INET); + + Where the third parameter indicates the protocol family of the transport + socket used - usually IPv4 but it can also be IPv6 [TODO]. + + (2) A local address can optionally be bound:: + + struct sockaddr_rxrpc srx = { + .srx_family = AF_RXRPC, + .srx_service = 0, /* we're a client */ + .transport_type = SOCK_DGRAM, /* type of transport socket */ + .transport.sin_family = AF_INET, + .transport.sin_port = htons(7000), /* AFS callback */ + .transport.sin_address = 0, /* all local interfaces */ + }; + bind(client, &srx, sizeof(srx)); + + This specifies the local UDP port to be used. If not given, a random + non-privileged port will be used. A UDP port may be shared between + several unrelated RxRPC sockets. Security is handled on a + per-RxRPC-virtual-connection basis. + + (3) The security is set:: + + const char *key = "AFS:cambridge.redhat.com"; + setsockopt(client, SOL_RXRPC, RXRPC_SECURITY_KEY, key, strlen(key)); + + This issues a request_key() to get the key representing the security + context. The minimum security level can be set:: + + unsigned int sec = RXRPC_SECURITY_ENCRYPTED; + setsockopt(client, SOL_RXRPC, RXRPC_MIN_SECURITY_LEVEL, + &sec, sizeof(sec)); + + (4) The server to be contacted can then be specified (alternatively this can + be done through sendmsg):: + + struct sockaddr_rxrpc srx = { + .srx_family = AF_RXRPC, + .srx_service = VL_SERVICE_ID, + .transport_type = SOCK_DGRAM, /* type of transport socket */ + .transport.sin_family = AF_INET, + .transport.sin_port = htons(7005), /* AFS volume manager */ + .transport.sin_address = ..., + }; + connect(client, &srx, sizeof(srx)); + + (5) The request data should then be posted to the server socket using a series + of sendmsg() calls, each with the following control message attached: + + ================== =================================== + RXRPC_USER_CALL_ID specifies the user ID for this call + ================== =================================== + + MSG_MORE should be set in msghdr::msg_flags on all but the last part of + the request. Multiple requests may be made simultaneously. + + An RXRPC_TX_LENGTH control message can also be specified on the first + sendmsg() call. + + If a call is intended to go to a destination other than the default + specified through connect(), then msghdr::msg_name should be set on the + first request message of that call. + + (6) The reply data will then be posted to the client socket for recvmsg() to + pick up. MSG_MORE will be flagged by recvmsg() if there's more reply data + for a particular call to be read. MSG_EOR will be set on the terminal + read for a call. + + All data will be delivered with the following control message attached: + + RXRPC_USER_CALL_ID - specifies the user ID for this call + + If an abort or error occurred, this will be returned in the control data + buffer instead, and MSG_EOR will be flagged to indicate the end of that + call.
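+
+Putting the above steps together, a minimal client sketch might look as
+follows. This is an illustration assembled from the snippets above, not
+code from the kernel tree: the service ID, address, port and operation ID
+are placeholders, an "rxrpc" key is assumed to already be on the keyring
+as in step (3), and the sockaddr_rxrpc layout follows linux/rxrpc.h (which
+nests the transport address as a union member) rather than the shorthand
+used in the snippets above::
+
+    #include <stdio.h>
+    #include <string.h>
+    #include <sys/socket.h>
+    #include <arpa/inet.h>
+    #include <linux/rxrpc.h>
+
+    #ifndef SOL_RXRPC
+    #define SOL_RXRPC 272               /* from linux/socket.h */
+    #endif
+
+    int main(void)
+    {
+            unsigned long call_id = 1;  /* app-chosen tag for this call */
+            const char *key = "AFS:cambridge.redhat.com";
+            unsigned char request[4] = { 0, 0, 0, 100 }; /* operation ID */
+            char ctrl[CMSG_SPACE(sizeof(call_id))];
+            struct sockaddr_rxrpc srx;
+            struct cmsghdr *cmsg;
+            struct msghdr msg;
+            struct iovec iov;
+            int client;
+
+            client = socket(AF_RXRPC, SOCK_DGRAM, PF_INET);
+            setsockopt(client, SOL_RXRPC, RXRPC_SECURITY_KEY,
+                       key, strlen(key));
+
+            memset(&srx, 0, sizeof(srx));
+            srx.srx_family = AF_RXRPC;
+            srx.srx_service = 52;       /* placeholder service ID */
+            srx.transport_type = SOCK_DGRAM;
+            srx.transport_len = sizeof(srx.transport.sin);
+            srx.transport.sin.sin_family = AF_INET;
+            srx.transport.sin.sin_port = htons(7005);
+            inet_pton(AF_INET, "192.0.2.1", &srx.transport.sin.sin_addr);
+            connect(client, (struct sockaddr *)&srx, sizeof(srx));
+
+            /* attach RXRPC_USER_CALL_ID so the kernel knows the tag */
+            iov.iov_base = request;
+            iov.iov_len = sizeof(request);
+            memset(&msg, 0, sizeof(msg));
+            msg.msg_iov = &iov;
+            msg.msg_iovlen = 1;
+            msg.msg_control = ctrl;
+            msg.msg_controllen = sizeof(ctrl);
+            cmsg = CMSG_FIRSTHDR(&msg);
+            cmsg->cmsg_level = SOL_RXRPC;
+            cmsg->cmsg_type = RXRPC_USER_CALL_ID;
+            cmsg->cmsg_len = CMSG_LEN(sizeof(call_id));
+            memcpy(CMSG_DATA(cmsg), &call_id, sizeof(call_id));
+
+            /* last (and only) part of the request, so no MSG_MORE */
+            if (sendmsg(client, &msg, 0) < 0)
+                    perror("sendmsg");
+            return 0;
+    }
+
+The reply would then be collected with recvmsg(), matching the call by the
+RXRPC_USER_CALL_ID control message as described in step (6).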
+
+A client may ask for a service ID it knows and request that this be upgraded
+to a better service if one is available by supplying RXRPC_UPGRADE_SERVICE on
+the first sendmsg() of a call.  The client should then check srx_service in
+the msg_name filled in by recvmsg() when collecting the result.  srx_service
+will hold the same value as given to sendmsg() if the upgrade request was
+ignored by the service - otherwise it will be altered to indicate the service
+ID the server upgraded to.  Note that the upgraded service ID is chosen by the
+server.  The caller has to wait until it sees the service ID in the reply
+before sending any more calls (further calls to the same destination will be
+blocked until the probe is concluded).
+
+
+====================
+EXAMPLE SERVER USAGE
+====================
+
+A server would be set up to accept operations in the following manner:
+
+ (1) An RxRPC socket is created by::
+
+        server = socket(AF_RXRPC, SOCK_DGRAM, PF_INET);
+
+     Where the third parameter indicates the address type of the transport
+     socket used - usually IPv4.
+
+ (2) Security is set up if desired by giving the socket a keyring with server
+     secret keys in it::
+
+        keyring = add_key("keyring", "AFSkeys", NULL, 0,
+                          KEY_SPEC_PROCESS_KEYRING);
+
+        const char secret_key[8] = {
+                0xa7, 0x83, 0x8a, 0xcb, 0xc7, 0x83, 0xec, 0x94 };
+        add_key("rxrpc_s", "52:2", secret_key, 8, keyring);
+
+        setsockopt(server, SOL_RXRPC, RXRPC_SECURITY_KEYRING, "AFSkeys", 7);
+
+     The keyring can be manipulated after it has been given to the socket. This
+     permits the server to add more keys, replace keys, etc. while it is live.
+
+ (3) A local address must then be bound::
+
+        struct sockaddr_rxrpc srx = {
+                .srx_family             = AF_RXRPC,
+                .srx_service            = VL_SERVICE_ID, /* RxRPC service ID */
+                .transport_type         = SOCK_DGRAM,   /* type of transport socket */
+                .transport.sin_family   = AF_INET,
+                .transport.sin_port     = htons(7000), /* AFS callback */
+                .transport.sin_address  = 0,  /* all local interfaces */
+        };
+        bind(server, &srx, sizeof(srx));
+
+     More than one service ID may be bound to a socket, provided the transport
+     parameters are the same.  The limit is currently two.  To do this, bind()
+     should be called twice.
+
+ (4) If service upgrading is required, two service IDs must first have been
+     bound and then the following option must be set::
+
+        unsigned short service_ids[2] = { from_ID, to_ID };
+        setsockopt(server, SOL_RXRPC, RXRPC_UPGRADEABLE_SERVICE,
+                   service_ids, sizeof(service_ids));
+
+     This will automatically upgrade connections on service from_ID to service
+     to_ID if they request it.  This will be reflected in msg_name obtained
+     through recvmsg() when the request data is delivered to userspace.
+
+ (5) The server is then set to listen out for incoming calls::
+
+        listen(server, 100);
+
+ (6) The kernel notifies the server of pending incoming connections by sending
+     it a message for each.  This is received with recvmsg() on the server
+     socket.  It has no data, and has a single dataless control message
+     attached::
+
+        RXRPC_NEW_CALL
+
+     The address that can be passed back by recvmsg() at this point should be
+     ignored since the call for which the message was posted may have gone by
+     the time it is accepted - in which case the first call still on the queue
+     will be accepted.  A minimal wait-and-accept loop is sketched below.
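+
+     One possible shape for that wait, paired with the accept described in
+     step (7) below, is the following sketch.  It assumes the same headers as
+     the client example earlier (<sys/socket.h>, <string.h>, <linux/rxrpc.h>
+     and the SOL_RXRPC fallback), the bound, listening server socket from the
+     steps above and an application-chosen call_id tag, and it elides error
+     handling::
+
+        static void wait_and_accept_one(int server, unsigned long call_id)
+        {
+                char ctrl[CMSG_SPACE(sizeof(call_id))];
+                char actrl[CMSG_SPACE(0) + CMSG_SPACE(sizeof(call_id))];
+                struct msghdr msg = {
+                        .msg_control    = ctrl,
+                        .msg_controllen = sizeof(ctrl),
+                };
+                struct cmsghdr *cm;
+
+                /* Step (6): a dataless message carrying RXRPC_NEW_CALL
+                 * announces each pending call.
+                 */
+                recvmsg(server, &msg, 0);
+                for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
+                        if (cm->cmsg_level != SOL_RXRPC ||
+                            cm->cmsg_type != RXRPC_NEW_CALL)
+                                continue;
+
+                        /* Step (7): accept whatever call is at the head of
+                         * the queue with RXRPC_ACCEPT plus
+                         * RXRPC_USER_CALL_ID in one dataless sendmsg().
+                         */
+                        struct msghdr amsg = {
+                                .msg_control    = actrl,
+                                .msg_controllen = sizeof(actrl),
+                        };
+                        struct cmsghdr *ac = CMSG_FIRSTHDR(&amsg);
+
+                        ac->cmsg_level = SOL_RXRPC;
+                        ac->cmsg_type = RXRPC_ACCEPT;
+                        ac->cmsg_len = CMSG_LEN(0);     /* no data */
+                        ac = CMSG_NXTHDR(&amsg, ac);
+                        ac->cmsg_level = SOL_RXRPC;
+                        ac->cmsg_type = RXRPC_USER_CALL_ID;
+                        ac->cmsg_len = CMSG_LEN(sizeof(call_id));
+                        memcpy(CMSG_DATA(ac), &call_id, sizeof(call_id));
+                        sendmsg(server, &amsg, 0);
+                }
+        }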
+
+ (7) The server then accepts the new call by issuing a sendmsg() with two
+     pieces of control data and no actual data:
+
+        ==================  ==============================
+        RXRPC_ACCEPT        indicate connection acceptance
+        RXRPC_USER_CALL_ID  specify user ID for this call
+        ==================  ==============================
+
+ (8) The first request data packet will then be posted to the server socket for
+     recvmsg() to pick up.  At that point, the RxRPC address for the call can
+     be read from the address fields in the msghdr struct.
+
+     Subsequent request data will be posted to the server socket for recvmsg()
+     to collect as it arrives.  All but the last piece of the request data will
+     be delivered with MSG_MORE flagged.
+
+     All data will be delivered with the following control message attached:
+
+        ==================  ===================================
+        RXRPC_USER_CALL_ID  specifies the user ID for this call
+        ==================  ===================================
+
+ (9) The reply data should then be posted to the server socket using a series
+     of sendmsg() calls, each with the following control messages attached:
+
+        ==================  ===================================
+        RXRPC_USER_CALL_ID  specifies the user ID for this call
+        ==================  ===================================
+
+     MSG_MORE should be set in msghdr::msg_flags on all but the last message
+     for a particular call.
+
+(10) The final ACK from the client will be posted for retrieval by recvmsg()
+     when it is received.  It will take the form of a dataless message with two
+     control messages attached:
+
+        ==================  ===================================
+        RXRPC_USER_CALL_ID  specifies the user ID for this call
+        RXRPC_ACK           indicates final ACK (no data)
+        ==================  ===================================
+
+     MSG_EOR will be flagged to indicate that this is the final message for
+     this call.
+
+(11) Up to the point the final packet of reply data is sent, the call can be
+     aborted by calling sendmsg() with a dataless message with the following
+     control messages attached:
+
+        ==================  ===================================
+        RXRPC_USER_CALL_ID  specifies the user ID for this call
+        RXRPC_ABORT         indicates abort code (4 byte data)
+        ==================  ===================================
+
+     Any packets waiting in the socket's receive queue will be discarded if
+     this is issued.
+
+Note that all the communications for a particular service take place through
+the one server socket, using control messages on sendmsg() and recvmsg() to
+determine the call affected.
+
+
+=========================
+AF_RXRPC KERNEL INTERFACE
+=========================
+
+The AF_RXRPC module also provides an interface for use by in-kernel utilities
+such as the AFS filesystem.  This permits such a utility to:
+
+ (1) Use different keys directly on individual client calls on one socket
+     rather than having to open a whole slew of sockets, one for each key it
+     might want to use.
+
+ (2) Avoid having RxRPC call request_key() at the point of issue of a call or
+     opening of a socket.  Instead the utility is responsible for requesting a
+     key at the appropriate point.  AFS, for instance, would do this during VFS
+     operations such as open() or unlink().  The key is then handed through
+     when the call is initiated.
+
+ (3) Request the use of something other than GFP_KERNEL to allocate memory.
+
+ (4) Avoid the overhead of using the recvmsg() call.  RxRPC messages can be
+     intercepted before they get put into the socket Rx queue and the socket
+     buffers manipulated directly.
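+
+In outline, a kernel-side client strings the functions described below
+together roughly as follows.  This is only a sketch: sock, srx, key,
+user_call_ID, notify_rx, debug_id, msg and the request/reply buffers are
+assumed to have been set up by the caller, and the retry on -EAGAIN would
+normally sleep until notify_rx reports progress rather than spin::
+
+        struct rxrpc_call *call;
+        size_t offset = 0;
+        u32 abort_code;
+        u16 service;
+        int ret;
+
+        call = rxrpc_kernel_begin_call(sock, &srx, key, user_call_ID,
+                                       request_len, GFP_KERNEL, notify_rx,
+                                       false, true, debug_id);
+        if (IS_ERR(call))               /* ERR_PTR() is returned on failure */
+                return PTR_ERR(call);
+
+        /* msg describes the request data; MSG_MORE is not set as this is
+         * the only send for the call.
+         */
+        ret = rxrpc_kernel_send_data(sock, call, &msg, request_len, NULL);
+
+        while (ret == 0 || ret == -EAGAIN)
+                /* want_more is false: the whole reply is wanted in one go */
+                ret = rxrpc_kernel_recv_data(sock, call, reply, reply_len,
+                                             &offset, false, &abort_code,
+                                             &service);
+
+        rxrpc_kernel_end_call(sock, call);
+        return ret == 1 ? 0 : ret;      /* 1 means the reply completed */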
+ +To use the RxRPC facility, a kernel utility must still open an AF_RXRPC socket, +bind an address as appropriate and listen if it's to be a server socket, but +then it passes this to the kernel interface functions. + +The kernel interface functions are as follows: + + (#) Begin a new client call:: + + struct rxrpc_call * + rxrpc_kernel_begin_call(struct socket *sock, + struct sockaddr_rxrpc *srx, + struct key *key, + unsigned long user_call_ID, + s64 tx_total_len, + gfp_t gfp, + rxrpc_notify_rx_t notify_rx, + bool upgrade, + bool intr, + unsigned int debug_id); + + This allocates the infrastructure to make a new RxRPC call and assigns + call and connection numbers. The call will be made on the UDP port that + the socket is bound to. The call will go to the destination address of a + connected client socket unless an alternative is supplied (srx is + non-NULL). + + If a key is supplied then this will be used to secure the call instead of + the key bound to the socket with the RXRPC_SECURITY_KEY sockopt. Calls + secured in this way will still share connections if at all possible. + + The user_call_ID is equivalent to that supplied to sendmsg() in the + control data buffer. It is entirely feasible to use this to point to a + kernel data structure. + + tx_total_len is the amount of data the caller is intending to transmit + with this call (or -1 if unknown at this point). Setting the data size + allows the kernel to encrypt directly to the packet buffers, thereby + saving a copy. The value may not be less than -1. + + notify_rx is a pointer to a function to be called when events such as + incoming data packets or remote aborts happen. + + upgrade should be set to true if a client operation should request that + the server upgrade the service to a better one. The resultant service ID + is returned by rxrpc_kernel_recv_data(). + + intr should be set to true if the call should be interruptible. If this + is not set, this function may not return until a channel has been + allocated; if it is set, the function may return -ERESTARTSYS. + + debug_id is the call debugging ID to be used for tracing. This can be + obtained by atomically incrementing rxrpc_debug_id. + + If this function is successful, an opaque reference to the RxRPC call is + returned. The caller now holds a reference on this and it must be + properly ended. + + (#) End a client call:: + + void rxrpc_kernel_end_call(struct socket *sock, + struct rxrpc_call *call); + + This is used to end a previously begun call. The user_call_ID is expunged + from AF_RXRPC's knowledge and will not be seen again in association with + the specified call. + + (#) Send data through a call:: + + typedef void (*rxrpc_notify_end_tx_t)(struct sock *sk, + unsigned long user_call_ID, + struct sk_buff *skb); + + int rxrpc_kernel_send_data(struct socket *sock, + struct rxrpc_call *call, + struct msghdr *msg, + size_t len, + rxrpc_notify_end_tx_t notify_end_rx); + + This is used to supply either the request part of a client call or the + reply part of a server call. msg.msg_iovlen and msg.msg_iov specify the + data buffers to be used. msg_iov may not be NULL and must point + exclusively to in-kernel virtual addresses. msg.msg_flags may be given + MSG_MORE if there will be subsequent data sends for this call. + + The msg must not specify a destination address, control data or any flags + other than MSG_MORE. len is the total amount of data to transmit. 
+
+     notify_end_rx can be NULL or it can be used to specify a function to be
+     called when the call changes state to end the Tx phase.  This function is
+     called with the call-state spinlock held to prevent any reply or final ACK
+     from being delivered first.
+
+ (#) Receive data from a call::
+
+        int rxrpc_kernel_recv_data(struct socket *sock,
+                                   struct rxrpc_call *call,
+                                   void *buf,
+                                   size_t size,
+                                   size_t *_offset,
+                                   bool want_more,
+                                   u32 *_abort,
+                                   u16 *_service)
+
+     This is used to receive data from either the reply part of a client call
+     or the request part of a service call.  buf and size specify how much
+     data is desired and where to store it.  ``*_offset`` is added on to buf
+     and subtracted from size internally; the amount copied into the buffer is
+     added to ``*_offset`` before returning.
+
+     want_more should be true if further data will be required after this is
+     satisfied and false if this is the last item of the receive phase.
+
+     There are three normal returns: 0 if the buffer was filled and want_more
+     was true; 1 if the buffer was filled, the last DATA packet has been
+     emptied and want_more was false; and -EAGAIN if the function needs to be
+     called again.
+
+     If the last DATA packet is processed but the buffer contains less than
+     the amount requested, EBADMSG is returned.  If want_more wasn't set, but
+     more data was available, EMSGSIZE is returned.
+
+     If a remote ABORT is detected, the abort code received will be stored in
+     ``*_abort`` and ECONNABORTED will be returned.
+
+     The service ID that the call ended up with is returned into ``*_service``.
+     This can be used to see if a call got a service upgrade.
+
+ (#) Abort a call::
+
+        void rxrpc_kernel_abort_call(struct socket *sock,
+                                     struct rxrpc_call *call,
+                                     u32 abort_code);
+
+     This is used to abort a call if it's still in an abortable state.  The
+     abort code specified will be placed in the ABORT message sent.
+
+ (#) Intercept received RxRPC messages::
+
+        typedef void (*rxrpc_interceptor_t)(struct sock *sk,
+                                            unsigned long user_call_ID,
+                                            struct sk_buff *skb);
+
+        void
+        rxrpc_kernel_intercept_rx_messages(struct socket *sock,
+                                           rxrpc_interceptor_t interceptor);
+
+     This installs an interceptor function on the specified AF_RXRPC socket.
+     All messages that would otherwise wind up in the socket's Rx queue are
+     then diverted to this function.  Note that care must be taken to process
+     the messages in the right order to maintain DATA message sequentiality.
+
+     The interceptor function itself is provided with the address of the socket
+     that is handling the incoming message, the ID assigned by the kernel
+     utility to the call and the socket buffer containing the message.
+
+     The skb->mark field indicates the type of message:
+
+        ===============================  =======================================
+        Mark                             Meaning
+        ===============================  =======================================
+        RXRPC_SKB_MARK_DATA              Data message
+        RXRPC_SKB_MARK_FINAL_ACK         Final ACK received for an incoming call
+        RXRPC_SKB_MARK_BUSY              Client call rejected as server busy
+        RXRPC_SKB_MARK_REMOTE_ABORT      Call aborted by peer
+        RXRPC_SKB_MARK_NET_ERROR         Network error detected
+        RXRPC_SKB_MARK_LOCAL_ERROR       Local error encountered
+        RXRPC_SKB_MARK_NEW_CALL          New incoming call awaiting acceptance
+        ===============================  =======================================
+
+     The remote abort message can be probed with rxrpc_kernel_get_abort_code().
+     The two error messages can be probed with rxrpc_kernel_get_error_number().
+     A new call can be accepted with rxrpc_kernel_accept_call().
+
+     Data messages can have their contents extracted with the usual bunch of
+     socket buffer manipulation functions.  A data message can be determined to
+     be the last one in a sequence with rxrpc_kernel_is_data_last().  When a
+     data message has been used up, rxrpc_kernel_data_consumed() should be
+     called on it.
+
+     Messages should be handed to rxrpc_kernel_free_skb() to dispose of them.
+     It is possible to get extra refs on all types of message for later
+     freeing, but this may pin the state of a call until the message is
+     finally freed.
+
+ (#) Accept an incoming call::
+
+        struct rxrpc_call *
+        rxrpc_kernel_accept_call(struct socket *sock,
+                                 unsigned long user_call_ID);
+
+     This is used to accept an incoming call and to assign it a call ID.  This
+     function is similar to rxrpc_kernel_begin_call() and calls accepted must
+     be ended in the same way.
+
+     If this function is successful, an opaque reference to the RxRPC call is
+     returned.  The caller now holds a reference on this and it must be
+     properly ended.
+
+ (#) Reject an incoming call::
+
+        int rxrpc_kernel_reject_call(struct socket *sock);
+
+     This is used to reject the first incoming call on the socket's queue with
+     a BUSY message.  -ENODATA is returned if there were no incoming calls.
+     Other errors may be returned if the call had been aborted (-ECONNABORTED)
+     or had timed out (-ETIME).
+
+ (#) Allocate a null key for doing anonymous security::
+
+        struct key *rxrpc_get_null_key(const char *keyname);
+
+     This is used to allocate a null RxRPC key that can be used to indicate
+     anonymous security for a particular domain.
+
+ (#) Get the peer address of a call::
+
+        void rxrpc_kernel_get_peer(struct socket *sock, struct rxrpc_call *call,
+                                   struct sockaddr_rxrpc *_srx);
+
+     This is used to find the remote peer address of a call.
+
+ (#) Set the total transmit data size on a call::
+
+        void rxrpc_kernel_set_tx_length(struct socket *sock,
+                                        struct rxrpc_call *call,
+                                        s64 tx_total_len);
+
+     This sets the amount of data that the caller is intending to transmit on a
+     call.  It's intended to be used for setting the reply size as the request
+     size should be set when the call is begun.  tx_total_len may not be less
+     than zero.
+
+ (#) Get call RTT::
+
+        u64 rxrpc_kernel_get_rtt(struct socket *sock, struct rxrpc_call *call);
+
+     Get the RTT to the peer in use by a call.  The value returned is in
+     nanoseconds.
+
+ (#) Check call still alive::
+
+        bool rxrpc_kernel_check_life(struct socket *sock,
+                                     struct rxrpc_call *call,
+                                     u32 *_life);
+        void rxrpc_kernel_probe_life(struct socket *sock,
+                                     struct rxrpc_call *call);
+
+     The first function passes back in ``*_life`` a number that is updated when
+     ACKs are received from the peer (notably including PING RESPONSE ACKs
+     which we can elicit by sending PING ACKs to see if the call still exists
+     on the server).  The caller should compare the numbers of two calls to see
+     if the call is still alive after waiting for a suitable interval.  It also
+     returns true as long as the call hasn't yet reached the completed state.
+
+     This allows the caller to work out if the server is still contactable and
+     if the call is still alive on the server while waiting for the server to
+     process a client operation.
+
+     The second function causes a ping ACK to be transmitted to try to provoke
+     the peer into responding, which would then cause the value returned by the
+     first function to change.  Note that this must be called in TASK_RUNNING
+     state.  A polling loop built on this pair is sketched below.
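+
+     The sketch assumes an established call on sock; the one-second interval
+     and the give_up condition are placeholders, and a real caller would
+     typically also be waiting for notify_rx to report completion::
+
+        u32 life, last_life;
+        bool alive = rxrpc_kernel_check_life(sock, call, &last_life);
+
+        while (alive && !give_up) {
+                schedule_timeout_interruptible(HZ);     /* suitable interval */
+                alive = rxrpc_kernel_check_life(sock, call, &life);
+                if (life == last_life)
+                        /* No ACKs seen: prod the peer for a ping response */
+                        rxrpc_kernel_probe_life(sock, call);
+                last_life = life;
+        }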
+
+ (#) Get reply timestamp::
+
+        bool rxrpc_kernel_get_reply_time(struct socket *sock,
+                                         struct rxrpc_call *call,
+                                         ktime_t *_ts)
+
+     This allows the timestamp on the first DATA packet of the reply of a
+     client call to be queried, provided that it is still in the Rx ring.  If
+     successful, the timestamp will be stored into ``*_ts`` and true will be
+     returned; false will be returned otherwise.
+
+ (#) Get remote client epoch::
+
+        u32 rxrpc_kernel_get_epoch(struct socket *sock,
+                                   struct rxrpc_call *call)
+
+     This allows the epoch that's contained in packets of an incoming client
+     call to be queried.  This value is returned.  The function is always
+     successful if the call is still in progress.  It shouldn't be called once
+     the call has expired.  Note that calling this on a local client call only
+     returns the local epoch.
+
+     This value can be used to determine if the remote client has been
+     restarted as it shouldn't change otherwise.
+
+ (#) Set the maximum lifespan on a call::
+
+        void rxrpc_kernel_set_max_life(struct socket *sock,
+                                       struct rxrpc_call *call,
+                                       unsigned long hard_timeout)
+
+     This sets the maximum lifespan on a call to hard_timeout (which is in
+     jiffies).  In the event of the timeout occurring, the call will be
+     aborted and -ETIME or -ETIMEDOUT will be returned.
+
+
+=======================
+CONFIGURABLE PARAMETERS
+=======================
+
+The RxRPC protocol driver has a number of configurable parameters that can be
+adjusted through sysctls in /proc/sys/net/rxrpc/:
+
+ (#) req_ack_delay
+
+     The amount of time in milliseconds after receiving a packet with the
+     request-ack flag set before we honour the flag and actually send the
+     requested ack.
+
+     Usually the other side won't stop sending packets until the advertised
+     reception window is full (to a maximum of 255 packets), so delaying the
+     ACK permits several packets to be ACK'd in one go.
+
+ (#) soft_ack_delay
+
+     The amount of time in milliseconds after receiving a new packet before we
+     generate a soft-ACK to tell the sender that it doesn't need to resend.
+
+ (#) idle_ack_delay
+
+     The amount of time in milliseconds after all the packets currently in the
+     received queue have been consumed before we generate a hard-ACK to tell
+     the sender it can free its buffers, assuming no other reason occurs that
+     we would send an ACK.
+
+ (#) resend_timeout
+
+     The amount of time in milliseconds after transmitting a packet before we
+     transmit it again, assuming no ACK is received from the receiver telling
+     us they got it.
+
+ (#) max_call_lifetime
+
+     The maximum amount of time in seconds that a call may be in progress
+     before we preemptively kill it.
+
+ (#) dead_call_expiry
+
+     The amount of time in seconds before we remove a dead call from the call
+     list.  Dead calls are kept around for a little while for the purpose of
+     repeating ACK and ABORT packets.
+
+ (#) connection_expiry
+
+     The amount of time in seconds after a connection was last used before we
+     remove it from the connection list.  While a connection is in existence,
+     it serves as a placeholder for negotiated security; when it is deleted,
+     the security must be renegotiated.
+
+ (#) transport_expiry
+
+     The amount of time in seconds after a transport was last used before we
+     remove it from the transport list.  While a transport is in existence, it
+     serves to anchor the peer data and keeps the connection ID counter.
+
+ (#) rxrpc_rx_window_size
+
+     The size of the receive window in packets.
This is the maximum number of + unconsumed received packets we're willing to hold in memory for any + particular call. + + (#) rxrpc_rx_mtu + + The maximum packet MTU size that we're willing to receive in bytes. This + indicates to the peer whether we're willing to accept jumbo packets. + + (#) rxrpc_rx_jumbo_max + + The maximum number of packets that we're willing to accept in a jumbo + packet. Non-terminal packets in a jumbo packet must contain a four byte + header plus exactly 1412 bytes of data. The terminal packet must contain + a four byte header plus any amount of data. In any event, a jumbo packet + may not exceed rxrpc_rx_mtu in size. diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt deleted file mode 100644 index 180e07d956a7..000000000000 --- a/Documentation/networking/rxrpc.txt +++ /dev/null @@ -1,1155 +0,0 @@ - ====================== - RxRPC NETWORK PROTOCOL - ====================== - -The RxRPC protocol driver provides a reliable two-phase transport on top of UDP -that can be used to perform RxRPC remote operations. This is done over sockets -of AF_RXRPC family, using sendmsg() and recvmsg() with control data to send and -receive data, aborts and errors. - -Contents of this document: - - (*) Overview. - - (*) RxRPC protocol summary. - - (*) AF_RXRPC driver model. - - (*) Control messages. - - (*) Socket options. - - (*) Security. - - (*) Example client usage. - - (*) Example server usage. - - (*) AF_RXRPC kernel interface. - - (*) Configurable parameters. - - -======== -OVERVIEW -======== - -RxRPC is a two-layer protocol. There is a session layer which provides -reliable virtual connections using UDP over IPv4 (or IPv6) as the transport -layer, but implements a real network protocol; and there's the presentation -layer which renders structured data to binary blobs and back again using XDR -(as does SunRPC): - - +-------------+ - | Application | - +-------------+ - | XDR | Presentation - +-------------+ - | RxRPC | Session - +-------------+ - | UDP | Transport - +-------------+ - - -AF_RXRPC provides: - - (1) Part of an RxRPC facility for both kernel and userspace applications by - making the session part of it a Linux network protocol (AF_RXRPC). - - (2) A two-phase protocol. The client transmits a blob (the request) and then - receives a blob (the reply), and the server receives the request and then - transmits the reply. - - (3) Retention of the reusable bits of the transport system set up for one call - to speed up subsequent calls. - - (4) A secure protocol, using the Linux kernel's key retention facility to - manage security on the client end. The server end must of necessity be - more active in security negotiations. - -AF_RXRPC does not provide XDR marshalling/presentation facilities. That is -left to the application. AF_RXRPC only deals in blobs. Even the operation ID -is just the first four bytes of the request blob, and as such is beyond the -kernel's interest. - - -Sockets of AF_RXRPC family are: - - (1) created as type SOCK_DGRAM; - - (2) provided with a protocol of the type of underlying transport they're going - to use - currently only PF_INET is supported. - - -The Andrew File System (AFS) is an example of an application that uses this and -that has both kernel (filesystem) and userspace (utility) components. 
- - -====================== -RXRPC PROTOCOL SUMMARY -====================== - -An overview of the RxRPC protocol: - - (*) RxRPC sits on top of another networking protocol (UDP is the only option - currently), and uses this to provide network transport. UDP ports, for - example, provide transport endpoints. - - (*) RxRPC supports multiple virtual "connections" from any given transport - endpoint, thus allowing the endpoints to be shared, even to the same - remote endpoint. - - (*) Each connection goes to a particular "service". A connection may not go - to multiple services. A service may be considered the RxRPC equivalent of - a port number. AF_RXRPC permits multiple services to share an endpoint. - - (*) Client-originating packets are marked, thus a transport endpoint can be - shared between client and server connections (connections have a - direction). - - (*) Up to a billion connections may be supported concurrently between one - local transport endpoint and one service on one remote endpoint. An RxRPC - connection is described by seven numbers: - - Local address } - Local port } Transport (UDP) address - Remote address } - Remote port } - Direction - Connection ID - Service ID - - (*) Each RxRPC operation is a "call". A connection may make up to four - billion calls, but only up to four calls may be in progress on a - connection at any one time. - - (*) Calls are two-phase and asymmetric: the client sends its request data, - which the service receives; then the service transmits the reply data - which the client receives. - - (*) The data blobs are of indefinite size, the end of a phase is marked with a - flag in the packet. The number of packets of data making up one blob may - not exceed 4 billion, however, as this would cause the sequence number to - wrap. - - (*) The first four bytes of the request data are the service operation ID. - - (*) Security is negotiated on a per-connection basis. The connection is - initiated by the first data packet on it arriving. If security is - requested, the server then issues a "challenge" and then the client - replies with a "response". If the response is successful, the security is - set for the lifetime of that connection, and all subsequent calls made - upon it use that same security. In the event that the server lets a - connection lapse before the client, the security will be renegotiated if - the client uses the connection again. - - (*) Calls use ACK packets to handle reliability. Data packets are also - explicitly sequenced per call. - - (*) There are two types of positive acknowledgment: hard-ACKs and soft-ACKs. - A hard-ACK indicates to the far side that all the data received to a point - has been received and processed; a soft-ACK indicates that the data has - been received but may yet be discarded and re-requested. The sender may - not discard any transmittable packets until they've been hard-ACK'd. - - (*) Reception of a reply data packet implicitly hard-ACK's all the data - packets that make up the request. - - (*) An call is complete when the request has been sent, the reply has been - received and the final hard-ACK on the last packet of the reply has - reached the server. - - (*) An call may be aborted by either end at any time up to its completion. - - -===================== -AF_RXRPC DRIVER MODEL -===================== - -About the AF_RXRPC driver: - - (*) The AF_RXRPC protocol transparently uses internal sockets of the transport - protocol to represent transport endpoints. 
- - (*) AF_RXRPC sockets map onto RxRPC connection bundles. Actual RxRPC - connections are handled transparently. One client socket may be used to - make multiple simultaneous calls to the same service. One server socket - may handle calls from many clients. - - (*) Additional parallel client connections will be initiated to support extra - concurrent calls, up to a tunable limit. - - (*) Each connection is retained for a certain amount of time [tunable] after - the last call currently using it has completed in case a new call is made - that could reuse it. - - (*) Each internal UDP socket is retained [tunable] for a certain amount of - time [tunable] after the last connection using it discarded, in case a new - connection is made that could use it. - - (*) A client-side connection is only shared between calls if they have have - the same key struct describing their security (and assuming the calls - would otherwise share the connection). Non-secured calls would also be - able to share connections with each other. - - (*) A server-side connection is shared if the client says it is. - - (*) ACK'ing is handled by the protocol driver automatically, including ping - replying. - - (*) SO_KEEPALIVE automatically pings the other side to keep the connection - alive [TODO]. - - (*) If an ICMP error is received, all calls affected by that error will be - aborted with an appropriate network error passed through recvmsg(). - - -Interaction with the user of the RxRPC socket: - - (*) A socket is made into a server socket by binding an address with a - non-zero service ID. - - (*) In the client, sending a request is achieved with one or more sendmsgs, - followed by the reply being received with one or more recvmsgs. - - (*) The first sendmsg for a request to be sent from a client contains a tag to - be used in all other sendmsgs or recvmsgs associated with that call. The - tag is carried in the control data. - - (*) connect() is used to supply a default destination address for a client - socket. This may be overridden by supplying an alternate address to the - first sendmsg() of a call (struct msghdr::msg_name). - - (*) If connect() is called on an unbound client, a random local port will - bound before the operation takes place. - - (*) A server socket may also be used to make client calls. To do this, the - first sendmsg() of the call must specify the target address. The server's - transport endpoint is used to send the packets. - - (*) Once the application has received the last message associated with a call, - the tag is guaranteed not to be seen again, and so it can be used to pin - client resources. A new call can then be initiated with the same tag - without fear of interference. - - (*) In the server, a request is received with one or more recvmsgs, then the - the reply is transmitted with one or more sendmsgs, and then the final ACK - is received with a last recvmsg. - - (*) When sending data for a call, sendmsg is given MSG_MORE if there's more - data to come on that call. - - (*) When receiving data for a call, recvmsg flags MSG_MORE if there's more - data to come for that call. - - (*) When receiving data or messages for a call, MSG_EOR is flagged by recvmsg - to indicate the terminal message for that call. - - (*) A call may be aborted by adding an abort control message to the control - data. Issuing an abort terminates the kernel's use of that call's tag. - Any messages waiting in the receive queue for that call will be discarded. 
- - (*) Aborts, busy notifications and challenge packets are delivered by recvmsg, - and control data messages will be set to indicate the context. Receiving - an abort or a busy message terminates the kernel's use of that call's tag. - - (*) The control data part of the msghdr struct is used for a number of things: - - (*) The tag of the intended or affected call. - - (*) Sending or receiving errors, aborts and busy notifications. - - (*) Notifications of incoming calls. - - (*) Sending debug requests and receiving debug replies [TODO]. - - (*) When the kernel has received and set up an incoming call, it sends a - message to server application to let it know there's a new call awaiting - its acceptance [recvmsg reports a special control message]. The server - application then uses sendmsg to assign a tag to the new call. Once that - is done, the first part of the request data will be delivered by recvmsg. - - (*) The server application has to provide the server socket with a keyring of - secret keys corresponding to the security types it permits. When a secure - connection is being set up, the kernel looks up the appropriate secret key - in the keyring and then sends a challenge packet to the client and - receives a response packet. The kernel then checks the authorisation of - the packet and either aborts the connection or sets up the security. - - (*) The name of the key a client will use to secure its communications is - nominated by a socket option. - - -Notes on sendmsg: - - (*) MSG_WAITALL can be set to tell sendmsg to ignore signals if the peer is - making progress at accepting packets within a reasonable time such that we - manage to queue up all the data for transmission. This requires the - client to accept at least one packet per 2*RTT time period. - - If this isn't set, sendmsg() will return immediately, either returning - EINTR/ERESTARTSYS if nothing was consumed or returning the amount of data - consumed. - - -Notes on recvmsg: - - (*) If there's a sequence of data messages belonging to a particular call on - the receive queue, then recvmsg will keep working through them until: - - (a) it meets the end of that call's received data, - - (b) it meets a non-data message, - - (c) it meets a message belonging to a different call, or - - (d) it fills the user buffer. - - If recvmsg is called in blocking mode, it will keep sleeping, awaiting the - reception of further data, until one of the above four conditions is met. - - (2) MSG_PEEK operates similarly, but will return immediately if it has put any - data in the buffer rather than sleeping until it can fill the buffer. - - (3) If a data message is only partially consumed in filling a user buffer, - then the remainder of that message will be left on the front of the queue - for the next taker. MSG_TRUNC will never be flagged. - - (4) If there is more data to be had on a call (it hasn't copied the last byte - of the last data message in that phase yet), then MSG_MORE will be - flagged. - - -================ -CONTROL MESSAGES -================ - -AF_RXRPC makes use of control messages in sendmsg() and recvmsg() to multiplex -calls, to invoke certain actions and to report certain conditions. 
These are: - - MESSAGE ID SRT DATA MEANING - ======================= === =========== =============================== - RXRPC_USER_CALL_ID sr- User ID App's call specifier - RXRPC_ABORT srt Abort code Abort code to issue/received - RXRPC_ACK -rt n/a Final ACK received - RXRPC_NET_ERROR -rt error num Network error on call - RXRPC_BUSY -rt n/a Call rejected (server busy) - RXRPC_LOCAL_ERROR -rt error num Local error encountered - RXRPC_NEW_CALL -r- n/a New call received - RXRPC_ACCEPT s-- n/a Accept new call - RXRPC_EXCLUSIVE_CALL s-- n/a Make an exclusive client call - RXRPC_UPGRADE_SERVICE s-- n/a Client call can be upgraded - RXRPC_TX_LENGTH s-- data len Total length of Tx data - - (SRT = usable in Sendmsg / delivered by Recvmsg / Terminal message) - - (*) RXRPC_USER_CALL_ID - - This is used to indicate the application's call ID. It's an unsigned long - that the app specifies in the client by attaching it to the first data - message or in the server by passing it in association with an RXRPC_ACCEPT - message. recvmsg() passes it in conjunction with all messages except - those of the RXRPC_NEW_CALL message. - - (*) RXRPC_ABORT - - This is can be used by an application to abort a call by passing it to - sendmsg, or it can be delivered by recvmsg to indicate a remote abort was - received. Either way, it must be associated with an RXRPC_USER_CALL_ID to - specify the call affected. If an abort is being sent, then error EBADSLT - will be returned if there is no call with that user ID. - - (*) RXRPC_ACK - - This is delivered to a server application to indicate that the final ACK - of a call was received from the client. It will be associated with an - RXRPC_USER_CALL_ID to indicate the call that's now complete. - - (*) RXRPC_NET_ERROR - - This is delivered to an application to indicate that an ICMP error message - was encountered in the process of trying to talk to the peer. An - errno-class integer value will be included in the control message data - indicating the problem, and an RXRPC_USER_CALL_ID will indicate the call - affected. - - (*) RXRPC_BUSY - - This is delivered to a client application to indicate that a call was - rejected by the server due to the server being busy. It will be - associated with an RXRPC_USER_CALL_ID to indicate the rejected call. - - (*) RXRPC_LOCAL_ERROR - - This is delivered to an application to indicate that a local error was - encountered and that a call has been aborted because of it. An - errno-class integer value will be included in the control message data - indicating the problem, and an RXRPC_USER_CALL_ID will indicate the call - affected. - - (*) RXRPC_NEW_CALL - - This is delivered to indicate to a server application that a new call has - arrived and is awaiting acceptance. No user ID is associated with this, - as a user ID must subsequently be assigned by doing an RXRPC_ACCEPT. - - (*) RXRPC_ACCEPT - - This is used by a server application to attempt to accept a call and - assign it a user ID. It should be associated with an RXRPC_USER_CALL_ID - to indicate the user ID to be assigned. If there is no call to be - accepted (it may have timed out, been aborted, etc.), then sendmsg will - return error ENODATA. If the user ID is already in use by another call, - then error EBADSLT will be returned. - - (*) RXRPC_EXCLUSIVE_CALL - - This is used to indicate that a client call should be made on a one-off - connection. The connection is discarded once the call has terminated. 
- - (*) RXRPC_UPGRADE_SERVICE - - This is used to make a client call to probe if the specified service ID - may be upgraded by the server. The caller must check msg_name returned to - recvmsg() for the service ID actually in use. The operation probed must - be one that takes the same arguments in both services. - - Once this has been used to establish the upgrade capability (or lack - thereof) of the server, the service ID returned should be used for all - future communication to that server and RXRPC_UPGRADE_SERVICE should no - longer be set. - - (*) RXRPC_TX_LENGTH - - This is used to inform the kernel of the total amount of data that is - going to be transmitted by a call (whether in a client request or a - service response). If given, it allows the kernel to encrypt from the - userspace buffer directly to the packet buffers, rather than copying into - the buffer and then encrypting in place. This may only be given with the - first sendmsg() providing data for a call. EMSGSIZE will be generated if - the amount of data actually given is different. - - This takes a parameter of __s64 type that indicates how much will be - transmitted. This may not be less than zero. - -The symbol RXRPC__SUPPORTED is defined as one more than the highest control -message type supported. At run time this can be queried by means of the -RXRPC_SUPPORTED_CMSG socket option (see below). - - -============== -SOCKET OPTIONS -============== - -AF_RXRPC sockets support a few socket options at the SOL_RXRPC level: - - (*) RXRPC_SECURITY_KEY - - This is used to specify the description of the key to be used. The key is - extracted from the calling process's keyrings with request_key() and - should be of "rxrpc" type. - - The optval pointer points to the description string, and optlen indicates - how long the string is, without the NUL terminator. - - (*) RXRPC_SECURITY_KEYRING - - Similar to above but specifies a keyring of server secret keys to use (key - type "keyring"). See the "Security" section. - - (*) RXRPC_EXCLUSIVE_CONNECTION - - This is used to request that new connections should be used for each call - made subsequently on this socket. optval should be NULL and optlen 0. - - (*) RXRPC_MIN_SECURITY_LEVEL - - This is used to specify the minimum security level required for calls on - this socket. optval must point to an int containing one of the following - values: - - (a) RXRPC_SECURITY_PLAIN - - Encrypted checksum only. - - (b) RXRPC_SECURITY_AUTH - - Encrypted checksum plus packet padded and first eight bytes of packet - encrypted - which includes the actual packet length. - - (c) RXRPC_SECURITY_ENCRYPTED - - Encrypted checksum plus entire packet padded and encrypted, including - actual packet length. - - (*) RXRPC_UPGRADEABLE_SERVICE - - This is used to indicate that a service socket with two bindings may - upgrade one bound service to the other if requested by the client. optval - must point to an array of two unsigned short ints. The first is the - service ID to upgrade from and the second the service ID to upgrade to. - - (*) RXRPC_SUPPORTED_CMSG - - This is a read-only option that writes an int into the buffer indicating - the highest control message type supported. - - -======== -SECURITY -======== - -Currently, only the kerberos 4 equivalent protocol has been implemented -(security index 2 - rxkad). This requires the rxkad module to be loaded and, -on the client, tickets of the appropriate type to be obtained from the AFS -kaserver or the kerberos server and installed as "rxrpc" type keys. 
This is -normally done using the klog program. An example simple klog program can be -found at: - - http://people.redhat.com/~dhowells/rxrpc/klog.c - -The payload provided to add_key() on the client should be of the following -form: - - struct rxrpc_key_sec2_v1 { - uint16_t security_index; /* 2 */ - uint16_t ticket_length; /* length of ticket[] */ - uint32_t expiry; /* time at which expires */ - uint8_t kvno; /* key version number */ - uint8_t __pad[3]; - uint8_t session_key[8]; /* DES session key */ - uint8_t ticket[0]; /* the encrypted ticket */ - }; - -Where the ticket blob is just appended to the above structure. - - -For the server, keys of type "rxrpc_s" must be made available to the server. -They have a description of ":" (eg: "52:2" for an -rxkad key for the AFS VL service). When such a key is created, it should be -given the server's secret key as the instantiation data (see the example -below). - - add_key("rxrpc_s", "52:2", secret_key, 8, keyring); - -A keyring is passed to the server socket by naming it in a sockopt. The server -socket then looks the server secret keys up in this keyring when secure -incoming connections are made. This can be seen in an example program that can -be found at: - - http://people.redhat.com/~dhowells/rxrpc/listen.c - - -==================== -EXAMPLE CLIENT USAGE -==================== - -A client would issue an operation by: - - (1) An RxRPC socket is set up by: - - client = socket(AF_RXRPC, SOCK_DGRAM, PF_INET); - - Where the third parameter indicates the protocol family of the transport - socket used - usually IPv4 but it can also be IPv6 [TODO]. - - (2) A local address can optionally be bound: - - struct sockaddr_rxrpc srx = { - .srx_family = AF_RXRPC, - .srx_service = 0, /* we're a client */ - .transport_type = SOCK_DGRAM, /* type of transport socket */ - .transport.sin_family = AF_INET, - .transport.sin_port = htons(7000), /* AFS callback */ - .transport.sin_address = 0, /* all local interfaces */ - }; - bind(client, &srx, sizeof(srx)); - - This specifies the local UDP port to be used. If not given, a random - non-privileged port will be used. A UDP port may be shared between - several unrelated RxRPC sockets. Security is handled on a basis of - per-RxRPC virtual connection. - - (3) The security is set: - - const char *key = "AFS:cambridge.redhat.com"; - setsockopt(client, SOL_RXRPC, RXRPC_SECURITY_KEY, key, strlen(key)); - - This issues a request_key() to get the key representing the security - context. The minimum security level can be set: - - unsigned int sec = RXRPC_SECURITY_ENCRYPTED; - setsockopt(client, SOL_RXRPC, RXRPC_MIN_SECURITY_LEVEL, - &sec, sizeof(sec)); - - (4) The server to be contacted can then be specified (alternatively this can - be done through sendmsg): - - struct sockaddr_rxrpc srx = { - .srx_family = AF_RXRPC, - .srx_service = VL_SERVICE_ID, - .transport_type = SOCK_DGRAM, /* type of transport socket */ - .transport.sin_family = AF_INET, - .transport.sin_port = htons(7005), /* AFS volume manager */ - .transport.sin_address = ..., - }; - connect(client, &srx, sizeof(srx)); - - (5) The request data should then be posted to the server socket using a series - of sendmsg() calls, each with the following control message attached: - - RXRPC_USER_CALL_ID - specifies the user ID for this call - - MSG_MORE should be set in msghdr::msg_flags on all but the last part of - the request. Multiple requests may be made simultaneously. - - An RXRPC_TX_LENGTH control message can also be specified on the first - sendmsg() call. 
- - If a call is intended to go to a destination other than the default - specified through connect(), then msghdr::msg_name should be set on the - first request message of that call. - - (6) The reply data will then be posted to the server socket for recvmsg() to - pick up. MSG_MORE will be flagged by recvmsg() if there's more reply data - for a particular call to be read. MSG_EOR will be set on the terminal - read for a call. - - All data will be delivered with the following control message attached: - - RXRPC_USER_CALL_ID - specifies the user ID for this call - - If an abort or error occurred, this will be returned in the control data - buffer instead, and MSG_EOR will be flagged to indicate the end of that - call. - -A client may ask for a service ID it knows and ask that this be upgraded to a -better service if one is available by supplying RXRPC_UPGRADE_SERVICE on the -first sendmsg() of a call. The client should then check srx_service in the -msg_name filled in by recvmsg() when collecting the result. srx_service will -hold the same value as given to sendmsg() if the upgrade request was ignored by -the service - otherwise it will be altered to indicate the service ID the -server upgraded to. Note that the upgraded service ID is chosen by the server. -The caller has to wait until it sees the service ID in the reply before sending -any more calls (further calls to the same destination will be blocked until the -probe is concluded). - - -==================== -EXAMPLE SERVER USAGE -==================== - -A server would be set up to accept operations in the following manner: - - (1) An RxRPC socket is created by: - - server = socket(AF_RXRPC, SOCK_DGRAM, PF_INET); - - Where the third parameter indicates the address type of the transport - socket used - usually IPv4. - - (2) Security is set up if desired by giving the socket a keyring with server - secret keys in it: - - keyring = add_key("keyring", "AFSkeys", NULL, 0, - KEY_SPEC_PROCESS_KEYRING); - - const char secret_key[8] = { - 0xa7, 0x83, 0x8a, 0xcb, 0xc7, 0x83, 0xec, 0x94 }; - add_key("rxrpc_s", "52:2", secret_key, 8, keyring); - - setsockopt(server, SOL_RXRPC, RXRPC_SECURITY_KEYRING, "AFSkeys", 7); - - The keyring can be manipulated after it has been given to the socket. This - permits the server to add more keys, replace keys, etc. while it is live. - - (3) A local address must then be bound: - - struct sockaddr_rxrpc srx = { - .srx_family = AF_RXRPC, - .srx_service = VL_SERVICE_ID, /* RxRPC service ID */ - .transport_type = SOCK_DGRAM, /* type of transport socket */ - .transport.sin_family = AF_INET, - .transport.sin_port = htons(7000), /* AFS callback */ - .transport.sin_address = 0, /* all local interfaces */ - }; - bind(server, &srx, sizeof(srx)); - - More than one service ID may be bound to a socket, provided the transport - parameters are the same. The limit is currently two. To do this, bind() - should be called twice. - - (4) If service upgrading is required, first two service IDs must have been - bound and then the following option must be set: - - unsigned short service_ids[2] = { from_ID, to_ID }; - setsockopt(server, SOL_RXRPC, RXRPC_UPGRADEABLE_SERVICE, - service_ids, sizeof(service_ids)); - - This will automatically upgrade connections on service from_ID to service - to_ID if they request it. This will be reflected in msg_name obtained - through recvmsg() when the request data is delivered to userspace. 
- - (5) The server is then set to listen out for incoming calls: - - listen(server, 100); - - (6) The kernel notifies the server of pending incoming connections by sending - it a message for each. This is received with recvmsg() on the server - socket. It has no data, and has a single dataless control message - attached: - - RXRPC_NEW_CALL - - The address that can be passed back by recvmsg() at this point should be - ignored since the call for which the message was posted may have gone by - the time it is accepted - in which case the first call still on the queue - will be accepted. - - (7) The server then accepts the new call by issuing a sendmsg() with two - pieces of control data and no actual data: - - RXRPC_ACCEPT - indicate connection acceptance - RXRPC_USER_CALL_ID - specify user ID for this call - - (8) The first request data packet will then be posted to the server socket for - recvmsg() to pick up. At that point, the RxRPC address for the call can - be read from the address fields in the msghdr struct. - - Subsequent request data will be posted to the server socket for recvmsg() - to collect as it arrives. All but the last piece of the request data will - be delivered with MSG_MORE flagged. - - All data will be delivered with the following control message attached: - - RXRPC_USER_CALL_ID - specifies the user ID for this call - - (9) The reply data should then be posted to the server socket using a series - of sendmsg() calls, each with the following control messages attached: - - RXRPC_USER_CALL_ID - specifies the user ID for this call - - MSG_MORE should be set in msghdr::msg_flags on all but the last message - for a particular call. - -(10) The final ACK from the client will be posted for retrieval by recvmsg() - when it is received. It will take the form of a dataless message with two - control messages attached: - - RXRPC_USER_CALL_ID - specifies the user ID for this call - RXRPC_ACK - indicates final ACK (no data) - - MSG_EOR will be flagged to indicate that this is the final message for - this call. - -(11) Up to the point the final packet of reply data is sent, the call can be - aborted by calling sendmsg() with a dataless message with the following - control messages attached: - - RXRPC_USER_CALL_ID - specifies the user ID for this call - RXRPC_ABORT - indicates abort code (4 byte data) - - Any packets waiting in the socket's receive queue will be discarded if - this is issued. - -Note that all the communications for a particular service take place through -the one server socket, using control messages on sendmsg() and recvmsg() to -determine the call affected. - - -========================= -AF_RXRPC KERNEL INTERFACE -========================= - -The AF_RXRPC module also provides an interface for use by in-kernel utilities -such as the AFS filesystem. This permits such a utility to: - - (1) Use different keys directly on individual client calls on one socket - rather than having to open a whole slew of sockets, one for each key it - might want to use. - - (2) Avoid having RxRPC call request_key() at the point of issue of a call or - opening of a socket. Instead the utility is responsible for requesting a - key at the appropriate point. AFS, for instance, would do this during VFS - operations such as open() or unlink(). The key is then handed through - when the call is initiated. - - (3) Request the use of something other than GFP_KERNEL to allocate memory. - - (4) Avoid the overhead of using the recvmsg() call. 
RxRPC messages can be - intercepted before they get put into the socket Rx queue and the socket - buffers manipulated directly. - -To use the RxRPC facility, a kernel utility must still open an AF_RXRPC socket, -bind an address as appropriate and listen if it's to be a server socket, but -then it passes this to the kernel interface functions. - -The kernel interface functions are as follows: - - (*) Begin a new client call. - - struct rxrpc_call * - rxrpc_kernel_begin_call(struct socket *sock, - struct sockaddr_rxrpc *srx, - struct key *key, - unsigned long user_call_ID, - s64 tx_total_len, - gfp_t gfp, - rxrpc_notify_rx_t notify_rx, - bool upgrade, - bool intr, - unsigned int debug_id); - - This allocates the infrastructure to make a new RxRPC call and assigns - call and connection numbers. The call will be made on the UDP port that - the socket is bound to. The call will go to the destination address of a - connected client socket unless an alternative is supplied (srx is - non-NULL). - - If a key is supplied then this will be used to secure the call instead of - the key bound to the socket with the RXRPC_SECURITY_KEY sockopt. Calls - secured in this way will still share connections if at all possible. - - The user_call_ID is equivalent to that supplied to sendmsg() in the - control data buffer. It is entirely feasible to use this to point to a - kernel data structure. - - tx_total_len is the amount of data the caller is intending to transmit - with this call (or -1 if unknown at this point). Setting the data size - allows the kernel to encrypt directly to the packet buffers, thereby - saving a copy. The value may not be less than -1. - - notify_rx is a pointer to a function to be called when events such as - incoming data packets or remote aborts happen. - - upgrade should be set to true if a client operation should request that - the server upgrade the service to a better one. The resultant service ID - is returned by rxrpc_kernel_recv_data(). - - intr should be set to true if the call should be interruptible. If this - is not set, this function may not return until a channel has been - allocated; if it is set, the function may return -ERESTARTSYS. - - debug_id is the call debugging ID to be used for tracing. This can be - obtained by atomically incrementing rxrpc_debug_id. - - If this function is successful, an opaque reference to the RxRPC call is - returned. The caller now holds a reference on this and it must be - properly ended. - - (*) End a client call. - - void rxrpc_kernel_end_call(struct socket *sock, - struct rxrpc_call *call); - - This is used to end a previously begun call. The user_call_ID is expunged - from AF_RXRPC's knowledge and will not be seen again in association with - the specified call. - - (*) Send data through a call. - - typedef void (*rxrpc_notify_end_tx_t)(struct sock *sk, - unsigned long user_call_ID, - struct sk_buff *skb); - - int rxrpc_kernel_send_data(struct socket *sock, - struct rxrpc_call *call, - struct msghdr *msg, - size_t len, - rxrpc_notify_end_tx_t notify_end_rx); - - This is used to supply either the request part of a client call or the - reply part of a server call. msg.msg_iovlen and msg.msg_iov specify the - data buffers to be used. msg_iov may not be NULL and must point - exclusively to in-kernel virtual addresses. msg.msg_flags may be given - MSG_MORE if there will be subsequent data sends for this call. - - The msg must not specify a destination address, control data or any flags - other than MSG_MORE. 
len is the total amount of data to transmit. - - notify_end_rx can be NULL or it can be used to specify a function to be - called when the call changes state to end the Tx phase. This function is - called with the call-state spinlock held to prevent any reply or final ACK - from being delivered first. - - (*) Receive data from a call. - - int rxrpc_kernel_recv_data(struct socket *sock, - struct rxrpc_call *call, - void *buf, - size_t size, - size_t *_offset, - bool want_more, - u32 *_abort, - u16 *_service) - - This is used to receive data from either the reply part of a client call - or the request part of a service call. buf and size specify how much - data is desired and where to store it. *_offset is added on to buf and - subtracted from size internally; the amount copied into the buffer is - added to *_offset before returning. - - want_more should be true if further data will be required after this is - satisfied and false if this is the last item of the receive phase. - - There are three normal returns: 0 if the buffer was filled and want_more - was true; 1 if the buffer was filled, the last DATA packet has been - emptied and want_more was false; and -EAGAIN if the function needs to be - called again. - - If the last DATA packet is processed but the buffer contains less than - the amount requested, EBADMSG is returned. If want_more wasn't set, but - more data was available, EMSGSIZE is returned. - - If a remote ABORT is detected, the abort code received will be stored in - *_abort and ECONNABORTED will be returned. - - The service ID that the call ended up with is returned into *_service. - This can be used to see if a call got a service upgrade. - - (*) Abort a call. - - void rxrpc_kernel_abort_call(struct socket *sock, - struct rxrpc_call *call, - u32 abort_code); - - This is used to abort a call if it's still in an abortable state. The - abort code specified will be placed in the ABORT message sent. - - (*) Intercept received RxRPC messages. - - typedef void (*rxrpc_interceptor_t)(struct sock *sk, - unsigned long user_call_ID, - struct sk_buff *skb); - - void - rxrpc_kernel_intercept_rx_messages(struct socket *sock, - rxrpc_interceptor_t interceptor); - - This installs an interceptor function on the specified AF_RXRPC socket. - All messages that would otherwise wind up in the socket's Rx queue are - then diverted to this function. Note that care must be taken to process - the messages in the right order to maintain DATA message sequentiality. - - The interceptor function itself is provided with the address of the socket - and handling the incoming message, the ID assigned by the kernel utility - to the call and the socket buffer containing the message. - - The skb->mark field indicates the type of message: - - MARK MEANING - =============================== ======================================= - RXRPC_SKB_MARK_DATA Data message - RXRPC_SKB_MARK_FINAL_ACK Final ACK received for an incoming call - RXRPC_SKB_MARK_BUSY Client call rejected as server busy - RXRPC_SKB_MARK_REMOTE_ABORT Call aborted by peer - RXRPC_SKB_MARK_NET_ERROR Network error detected - RXRPC_SKB_MARK_LOCAL_ERROR Local error encountered - RXRPC_SKB_MARK_NEW_CALL New incoming call awaiting acceptance - - The remote abort message can be probed with rxrpc_kernel_get_abort_code(). - The two error messages can be probed with rxrpc_kernel_get_error_number(). - A new call can be accepted with rxrpc_kernel_accept_call(). 
- - Data messages can have their contents extracted with the usual bunch of - socket buffer manipulation functions. A data message can be determined to - be the last one in a sequence with rxrpc_kernel_is_data_last(). When a - data message has been used up, rxrpc_kernel_data_consumed() should be - called on it. - - Messages should be handed to rxrpc_kernel_free_skb() to dispose of. It - is possible to get extra refs on all types of message for later freeing, - but this may pin the state of a call until the message is finally freed. - - (*) Accept an incoming call. - - struct rxrpc_call * - rxrpc_kernel_accept_call(struct socket *sock, - unsigned long user_call_ID); - - This is used to accept an incoming call and to assign it a call ID. This - function is similar to rxrpc_kernel_begin_call() and calls accepted must - be ended in the same way. - - If this function is successful, an opaque reference to the RxRPC call is - returned. The caller now holds a reference on this and it must be - properly ended. - - (*) Reject an incoming call. - - int rxrpc_kernel_reject_call(struct socket *sock); - - This is used to reject the first incoming call on the socket's queue with - a BUSY message. -ENODATA is returned if there were no incoming calls. - Other errors may be returned if the call had been aborted (-ECONNABORTED) - or had timed out (-ETIME). - - (*) Allocate a null key for doing anonymous security. - - struct key *rxrpc_get_null_key(const char *keyname); - - This is used to allocate a null RxRPC key that can be used to indicate - anonymous security for a particular domain. - - (*) Get the peer address of a call. - - void rxrpc_kernel_get_peer(struct socket *sock, struct rxrpc_call *call, - struct sockaddr_rxrpc *_srx); - - This is used to find the remote peer address of a call. - - (*) Set the total transmit data size on a call. - - void rxrpc_kernel_set_tx_length(struct socket *sock, - struct rxrpc_call *call, - s64 tx_total_len); - - This sets the amount of data that the caller is intending to transmit on a - call. It's intended to be used for setting the reply size as the request - size should be set when the call is begun. tx_total_len may not be less - than zero. - - (*) Get call RTT. - - u64 rxrpc_kernel_get_rtt(struct socket *sock, struct rxrpc_call *call); - - Get the RTT to the peer in use by a call. The value returned is in - nanoseconds. - - (*) Check call still alive. - - bool rxrpc_kernel_check_life(struct socket *sock, - struct rxrpc_call *call, - u32 *_life); - void rxrpc_kernel_probe_life(struct socket *sock, - struct rxrpc_call *call); - - The first function passes back in *_life a number that is updated when - ACKs are received from the peer (notably including PING RESPONSE ACKs - which we can elicit by sending PING ACKs to see if the call still exists - on the server). The caller should compare the numbers of two calls to - this function to see if the call is still alive after waiting for a - suitable interval. It also returns true as long as the call hasn't yet - reached the completed state. - - This allows the caller to work out if the server is still contactable and - if the call is still alive on the server while waiting for the server to - process a client operation. - - The second function causes a ping ACK to be transmitted to try to provoke - the peer into responding, which would then cause the value returned by the - first function to change. Note that this must be called in TASK_RUNNING - state.
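For illustration, a caller waiting for a server operation might poll like this; a rough sketch in which my_operation_pending() is a hypothetical predicate:

	u32 life, last_life;
	bool alive;

	alive = rxrpc_kernel_check_life(sock, call, &last_life);
	while (alive && my_operation_pending(call)) {
		schedule_timeout_interruptible(HZ);
		alive = rxrpc_kernel_check_life(sock, call, &life);
		if (alive && life == last_life)
			/* No new ACKs seen; prod the peer for a response. */
			rxrpc_kernel_probe_life(sock, call);
		last_life = life;
	}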
- - (*) Get reply timestamp. - - bool rxrpc_kernel_get_reply_time(struct socket *sock, - struct rxrpc_call *call, - ktime_t *_ts) - - This allows the timestamp on the first DATA packet of the reply of a - client call to be queried, provided that it is still in the Rx ring. If - successful, the timestamp will be stored into *_ts and true will be - returned; false will be returned otherwise. - - (*) Get remote client epoch. - - u32 rxrpc_kernel_get_epoch(struct socket *sock, - struct rxrpc_call *call) - - This allows the epoch that's contained in packets of an incoming client - call to be queried. This value is returned. The function is always - successful if the call is still in progress. It shouldn't be called once - the call has expired. Note that calling this on a local client call only - returns the local epoch. - - This value can be used to determine if the remote client has been - restarted as it shouldn't change otherwise. - - (*) Set the maximum lifespan on a call. - - void rxrpc_kernel_set_max_life(struct socket *sock, - struct rxrpc_call *call, - unsigned long hard_timeout) - - This sets the maximum lifespan on a call to hard_timeout (which is in - jiffies). In the event of the timeout occurring, the call will be - aborted and -ETIME or -ETIMEDOUT will be returned. - - -======================= -CONFIGURABLE PARAMETERS -======================= - -The RxRPC protocol driver has a number of configurable parameters that can be -adjusted through sysctls in /proc/sys/net/rxrpc/: - - (*) req_ack_delay - - The amount of time in milliseconds after receiving a packet with the - request-ack flag set before we honour the flag and actually send the - requested ack. - - Usually the other side won't stop sending packets until the advertised - reception window is full (to a maximum of 255 packets), so delaying the - ACK permits several packets to be ACK'd in one go. - - (*) soft_ack_delay - - The amount of time in milliseconds after receiving a new packet before we - generate a soft-ACK to tell the sender that it doesn't need to resend. - - (*) idle_ack_delay - - The amount of time in milliseconds after all the packets currently in the - received queue have been consumed before we generate a hard-ACK to tell - the sender it can free its buffers, assuming no other reason occurs that - would cause us to send an ACK. - - (*) resend_timeout - - The amount of time in milliseconds after transmitting a packet before we - transmit it again, assuming no ACK is received from the receiver telling - us they got it. - - (*) max_call_lifetime - - The maximum amount of time in seconds that a call may be in progress - before we preemptively kill it. - - (*) dead_call_expiry - - The amount of time in seconds before we remove a dead call from the call - list. Dead calls are kept around for a little while for the purpose of - repeating ACK and ABORT packets. - - (*) connection_expiry - - The amount of time in seconds after a connection was last used before we - remove it from the connection list. While a connection is in existence, - it serves as a placeholder for negotiated security; when it is deleted, - the security must be renegotiated. - - (*) transport_expiry - - The amount of time in seconds after a transport was last used before we - remove it from the transport list. While a transport is in existence, it - serves to anchor the peer data and keeps the connection ID counter. - - (*) rxrpc_rx_window_size - - The size of the receive window in packets. This is the maximum number of - unconsumed received packets we're willing to hold in memory for any - particular call. - - (*) rxrpc_rx_mtu - - The maximum packet MTU size that we're willing to receive in bytes. This - indicates to the peer whether we're willing to accept jumbo packets. - - (*) rxrpc_rx_jumbo_max - - The maximum number of packets that we're willing to accept in a jumbo - packet. Non-terminal packets in a jumbo packet must contain a four byte - header plus exactly 1412 bytes of data. The terminal packet must contain - a four byte header plus any amount of data. In any event, a jumbo packet - may not exceed rxrpc_rx_mtu in size.
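For example, assuming these parameters are exposed as correspondingly named files under /proc/sys/net/rxrpc/, the idle-ACK delay could be inspected and adjusted from a shell like this (a sketch; the value shown is illustrative only):

	# cat /proc/sys/net/rxrpc/idle_ack_delay
	# echo 2 >/proc/sys/net/rxrpc/idle_ack_delay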
diff --git a/MAINTAINERS b/MAINTAINERS index b28823ab48c5..866a0dcd66ef 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14593,7 +14593,7 @@ M: David Howells L: linux-afs@lists.infradead.org S: Supported W: https://www.infradead.org/~dhowells/kafs/ -F: Documentation/networking/rxrpc.txt +F: Documentation/networking/rxrpc.rst F: include/keys/rxrpc-type.h F: include/net/af_rxrpc.h F: include/trace/events/rxrpc.h diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig index 57ebb29c26ad..d706bb408365 100644 --- a/net/rxrpc/Kconfig +++ b/net/rxrpc/Kconfig @@ -18,7 +18,7 @@ config AF_RXRPC This module at the moment only supports client operations and is currently incomplete. - See Documentation/networking/rxrpc.txt. + See Documentation/networking/rxrpc.rst. config AF_RXRPC_IPV6 bool "IPv6 support for RxRPC" @@ -41,7 +41,7 @@ config AF_RXRPC_DEBUG help Say Y here to make runtime controllable debugging messages appear. - See Documentation/networking/rxrpc.txt. + See Documentation/networking/rxrpc.rst. config RXKAD @@ -56,4 +56,4 @@ config RXKAD Provide kerberos 4 and AFS kaserver security handling for AF_RXRPC through the use of the key retention service. - See Documentation/networking/rxrpc.txt. + See Documentation/networking/rxrpc.rst. diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index 2bbb38161851..174e903e18de 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -21,7 +21,7 @@ static const unsigned long max_jiffies = MAX_JIFFY_OFFSET; /* * RxRPC operating parameters. * - * See Documentation/networking/rxrpc.txt and the variable definitions for more + * See Documentation/networking/rxrpc.rst and the variable definitions for more * information on the individual parameters. */ static struct ctl_table rxrpc_sysctl_table[] = { -- cgit v1.2.3 From 4ac0b122ee63d89b5aaf2e3e376092d8ac02a567 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:32 +0200 Subject: docs: networking: convert tproxy.txt to ReST - add SPDX header; - adjust title markup; - mark code blocks and literals as such; - adjust indentation, whitespace and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/tproxy.rst | 109 ++++++++++++++++++++++++++++++++++++ Documentation/networking/tproxy.txt | 104 ---------------------------------- net/netfilter/Kconfig | 2 +- 4 files changed, 111 insertions(+), 105 deletions(-) create mode 100644 Documentation/networking/tproxy.rst delete mode 100644 Documentation/networking/tproxy.txt (limited to 'net') diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 8f9a84b8e3f2..b423b2db5f96 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -110,6 +110,7 @@ Contents: tcp-thin team timestamping + tproxy ..
only:: subproject and html diff --git a/Documentation/networking/tproxy.rst b/Documentation/networking/tproxy.rst new file mode 100644 index 000000000000..00dc3a1a66b4 --- /dev/null +++ b/Documentation/networking/tproxy.rst @@ -0,0 +1,109 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================= +Transparent proxy support +========================= + +This feature adds Linux 2.2-like transparent proxy support to current kernels. +To use it, enable the socket match and the TPROXY target in your kernel config. +You will need policy routing too, so be sure to enable that as well. + +From Linux 4.18 transparent proxy support is also available in nf_tables. + +1. Making non-local sockets work +================================ + +The idea is that you identify packets with destination address matching a local +socket on your box, set the packet mark to a certain value:: + + # iptables -t mangle -N DIVERT + # iptables -t mangle -A PREROUTING -p tcp -m socket -j DIVERT + # iptables -t mangle -A DIVERT -j MARK --set-mark 1 + # iptables -t mangle -A DIVERT -j ACCEPT + +Alternatively you can do this in nft with the following commands:: + + # nft add table filter + # nft add chain filter divert "{ type filter hook prerouting priority -150; }" + # nft add rule filter divert meta l4proto tcp socket transparent 1 meta mark set 1 accept + +And then match on that value using policy routing to have those packets +delivered locally:: + + # ip rule add fwmark 1 lookup 100 + # ip route add local 0.0.0.0/0 dev lo table 100 + +Because of certain restrictions in the IPv4 routing output code you'll have to +modify your application to allow it to send datagrams _from_ non-local IP +addresses. All you have to do is enable the (SOL_IP, IP_TRANSPARENT) socket +option before calling bind:: + + fd = socket(AF_INET, SOCK_STREAM, 0); + /* - 8< -*/ + int value = 1; + setsockopt(fd, SOL_IP, IP_TRANSPARENT, &value, sizeof(value)); + /* - 8< -*/ + name.sin_family = AF_INET; + name.sin_port = htons(0xCAFE); + name.sin_addr.s_addr = htonl(0xDEADBEEF); + bind(fd, &name, sizeof(name)); + +A trivial patch for netcat is available here: +http://people.netfilter.org/hidden/tproxy/netcat-ip_transparent-support.patch + + +2. Redirecting traffic +====================== + +Transparent proxying often involves "intercepting" traffic on a router. This is +usually done with the iptables REDIRECT target; however, there are serious +limitations of that method. One of the major issues is that it actually +modifies the packets to change the destination address -- which might not be +acceptable in certain situations. (Think of proxying UDP for example: you won't +be able to find out the original destination address. Even in case of TCP +getting the original destination address is racy.) + +The 'TPROXY' target provides similar functionality without relying on NAT. Simply +add rules like this to the iptables ruleset above:: + + # iptables -t mangle -A PREROUTING -p tcp --dport 80 -j TPROXY \ + --tproxy-mark 0x1/0x1 --on-port 50080 + +Or the following rule to nft:: + + # nft add rule filter divert tcp dport 80 tproxy to :50080 meta mark set 1 accept + +Note that for this to work you'll have to modify the proxy to enable (SOL_IP, +IP_TRANSPARENT) for the listening socket. + +As an example implementation, tcprdr is available here: +https://git.breakpoint.cc/cgit/fw/tcprdr.git/ +This tool is written by Florian Westphal and it was used for testing during the +nf_tables implementation. +
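+As a sketch of the listening side (illustrative only: error handling is
+omitted and the port matches the TPROXY rule above), note that for TCP the
+original destination address is simply the accepted socket's local name, as
+reported by getsockname()::
+
+    int value = 1;
+
+    fd = socket(AF_INET, SOCK_STREAM, 0);
+    setsockopt(fd, SOL_IP, IP_TRANSPARENT, &value, sizeof(value));
+    name.sin_family = AF_INET;
+    name.sin_port = htons(50080);
+    name.sin_addr.s_addr = htonl(INADDR_ANY);
+    bind(fd, &name, sizeof(name));
+    listen(fd, 128);
+
+    client = accept(fd, NULL, NULL);
+    len = sizeof(dst);
+    getsockname(client, (struct sockaddr *)&dst, &len);
+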
+3. Iptables and nf_tables extensions +==================================== + +To use tproxy you'll need to have the following modules compiled for iptables: + + - NETFILTER_XT_MATCH_SOCKET + - NETFILTER_XT_TARGET_TPROXY + +Or the following modules for nf_tables: + + - NFT_SOCKET + - NFT_TPROXY + +4. Application support +====================== + +4.1. Squid +---------- + +Squid 3.HEAD has support built-in. To use it, pass +'--enable-linux-netfilter' to configure and set the 'tproxy' option on +the HTTP listener you redirect traffic to with the TPROXY iptables +target. + +For more information please consult the following page on the Squid +wiki: http://wiki.squid-cache.org/Features/Tproxy4 diff --git a/Documentation/networking/tproxy.txt b/Documentation/networking/tproxy.txt deleted file mode 100644 index b9a188823d9f..000000000000 --- a/Documentation/networking/tproxy.txt +++ /dev/null @@ -1,104 +0,0 @@ -Transparent proxy support -========================= - -This feature adds Linux 2.2-like transparent proxy support to current kernels. -To use it, enable the socket match and the TPROXY target in your kernel config. -You will need policy routing too, so be sure to enable that as well. - -From Linux 4.18 transparent proxy support is also available in nf_tables. - -1. Making non-local sockets work -================================ - -The idea is that you identify packets with destination address matching a local -socket on your box, set the packet mark to a certain value: - -# iptables -t mangle -N DIVERT -# iptables -t mangle -A PREROUTING -p tcp -m socket -j DIVERT -# iptables -t mangle -A DIVERT -j MARK --set-mark 1 -# iptables -t mangle -A DIVERT -j ACCEPT - -Alternatively you can do this in nft with the following commands: - -# nft add table filter -# nft add chain filter divert "{ type filter hook prerouting priority -150; }" -# nft add rule filter divert meta l4proto tcp socket transparent 1 meta mark set 1 accept - -And then match on that value using policy routing to have those packets -delivered locally: - -# ip rule add fwmark 1 lookup 100 -# ip route add local 0.0.0.0/0 dev lo table 100 - -Because of certain restrictions in the IPv4 routing output code you'll have to -modify your application to allow it to send datagrams _from_ non-local IP -addresses. All you have to do is enable the (SOL_IP, IP_TRANSPARENT) socket -option before calling bind: - -fd = socket(AF_INET, SOCK_STREAM, 0); -/* - 8< -*/ -int value = 1; -setsockopt(fd, SOL_IP, IP_TRANSPARENT, &value, sizeof(value)); -/* - 8< -*/ -name.sin_family = AF_INET; -name.sin_port = htons(0xCAFE); -name.sin_addr.s_addr = htonl(0xDEADBEEF); -bind(fd, &name, sizeof(name)); - -A trivial patch for netcat is available here: -http://people.netfilter.org/hidden/tproxy/netcat-ip_transparent-support.patch - - -2. Redirecting traffic -====================== - -Transparent proxying often involves "intercepting" traffic on a router. This is -usually done with the iptables REDIRECT target; however, there are serious -limitations of that method. One of the major issues is that it actually -modifies the packets to change the destination address -- which might not be -acceptable in certain situations. (Think of proxying UDP for example: you won't -be able to find out the original destination address. Even in case of TCP -getting the original destination address is racy.) - -The 'TPROXY' target provides similar functionality without relying on NAT.
Simply -add rules like this to the iptables ruleset above: - -# iptables -t mangle -A PREROUTING -p tcp --dport 80 -j TPROXY \ - --tproxy-mark 0x1/0x1 --on-port 50080 - -Or the following rule to nft: - -# nft add rule filter divert tcp dport 80 tproxy to :50080 meta mark set 1 accept - -Note that for this to work you'll have to modify the proxy to enable (SOL_IP, -IP_TRANSPARENT) for the listening socket. - -As an example implementation, tcprdr is available here: -https://git.breakpoint.cc/cgit/fw/tcprdr.git/ -This tool is written by Florian Westphal and it was used for testing during the -nf_tables implementation. - -3. Iptables and nf_tables extensions -==================================== - -To use tproxy you'll need to have the following modules compiled for iptables: - - NETFILTER_XT_MATCH_SOCKET - - NETFILTER_XT_TARGET_TPROXY - -Or the floowing modules for nf_tables: - - NFT_SOCKET - - NFT_TPROXY - -4. Application support -====================== - -4.1. Squid ----------- - -Squid 3.HEAD has support built-in. To use it, pass -'--enable-linux-netfilter' to configure and set the 'tproxy' option on -the HTTP listener you redirect traffic to with the TPROXY iptables -target. - -For more information please consult the following page on the Squid -wiki: http://wiki.squid-cache.org/Features/Tproxy4 diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 468fea1aebba..3a3915d2e1ea 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -1043,7 +1043,7 @@ config NETFILTER_XT_TARGET_TPROXY on Netfilter connection tracking and NAT, unlike REDIRECT. For it to work you will have to configure certain iptables rules and use policy routing. For more information on how to set it up - see Documentation/networking/tproxy.txt. + see Documentation/networking/tproxy.rst. To compile it as a module, choose M here. If unsure, say N. -- cgit v1.2.3 From 2b195850128f5bafde177b12489d9fa27962cc1e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Apr 2020 10:35:41 -0700 Subject: tcp: add tp->dup_ack_counter In commit 86de5921a3d5 ("tcp: defer SACK compression after DupThresh") I added a TCP_FASTRETRANS_THRESH bias to tp->compressed_ack in order to enable sack compression only after 3 dupacks. Since we plan to relax this rule for flows that involve stacks not requiring this old rule, this patch adds a distinct tp->dup_ack_counter. This means the TCP_FASTRETRANS_THRESH value is now used in a single location that a future patch can adjust: if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) { tp->dup_ack_counter++; goto send_now; } This patch also introduces tcp_sack_compress_send_ack() helper to ease following patch comprehension. This patch refines LINUX_MIB_TCPACKCOMPRESSED to not count the acks that we had to send if the timer expires or tcp_sack_compress_send_ack() is sending an ack. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp_input.c | 36 +++++++++++++++++++++++++++--------- net/ipv4/tcp_output.c | 6 +++--- net/ipv4/tcp_timer.c | 8 +++++++- 4 files changed, 38 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 421c99c12291..2c6f87e9f0cf 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -268,6 +268,7 @@ struct tcp_sock { } rack; u16 advmss; /* Advertised MSS */ u8 compressed_ack; + u8 dup_ack_counter; u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u8 chrono_type:2, /* current chronograph type */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bf4ced9273e8..da777df0a0ba 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4327,6 +4327,27 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) } } +static void tcp_sack_compress_send_ack(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!tp->compressed_ack) + return; + + if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) + __sock_put(sk); + + /* Since we have to send one ack finally, + * subtract one from tp->compressed_ack to keep + * LINUX_MIB_TCPACKCOMPRESSED accurate. + */ + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, + tp->compressed_ack - 1); + + tp->compressed_ack = 0; + tcp_send_ack(sk); +} + static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_sock *tp = tcp_sk(sk); @@ -4355,8 +4376,7 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) * If the sack array is full, forget about the last one. */ if (this_sack >= TCP_NUM_SACKS) { - if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) - tcp_send_ack(sk); + tcp_sack_compress_send_ack(sk); this_sack--; tp->rx_opt.num_sacks--; sp--; @@ -5275,15 +5295,13 @@ send_now: if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) { tp->compressed_ack_rcv_nxt = tp->rcv_nxt; - if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) - NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, - tp->compressed_ack - TCP_FASTRETRANS_THRESH); - tp->compressed_ack = 0; + tp->dup_ack_counter = 0; } - - if (++tp->compressed_ack <= TCP_FASTRETRANS_THRESH) + if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) { + tp->dup_ack_counter++; goto send_now; - + } + tp->compressed_ack++; if (hrtimer_is_queued(&tp->compressed_ack_timer)) return; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ba4482130f08..c414aeb1efa9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -184,10 +184,10 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, { struct tcp_sock *tp = tcp_sk(sk); - if (unlikely(tp->compressed_ack > TCP_FASTRETRANS_THRESH)) { + if (unlikely(tp->compressed_ack)) { NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, - tp->compressed_ack - TCP_FASTRETRANS_THRESH); - tp->compressed_ack = TCP_FASTRETRANS_THRESH; + tp->compressed_ack); + tp->compressed_ack = 0; if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) __sock_put(sk); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index c3f26dcd6704..ada046f425d2 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -753,8 +753,14 @@ static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer) bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { - if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) + if (tp->compressed_ack) { + /* Since we have to send one ack finally, + * subtract one from tp->compressed_ack
to keep + * LINUX_MIB_TCPACKCOMPRESSED accurate. */ + tp->compressed_ack--; tcp_send_ack(sk); + } } else { if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags)) -- cgit v1.2.3 From ccd0628fca440268711560a1dbacc727b4f9e214 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Apr 2020 10:35:42 -0700 Subject: tcp: tcp_sack_new_ofo_skb() should be more conservative Currently, tcp_sack_new_ofo_skb() sends an ack if prior acks were 'compressed', if room has to be made in tp->selective_acks[]. But there is no guarantee all four sack ranges can be included in the SACK option. As a matter of fact, when TCP timestamps option is used, only three SACK ranges can be included. Let's assume only two ranges can be included, and force the ack: - When we touch more than 2 ranges in the reordering done when tcp_sack_extend() succeeds. - If we have at least 2 ranges when adding a new one. This enforces that before a range is in third or fourth position, at least one ACK packet included it in first/second position. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index da777df0a0ba..ef921ecba415 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4348,6 +4348,12 @@ static void tcp_sack_compress_send_ack(struct sock *sk) tcp_send_ack(sk); } +/* Reasonable amount of sack blocks included in TCP SACK option * The max is 4, but this becomes 3 if TCP timestamps are there. * Given that SACK packets might be lost, be conservative and use 2. */ +#define TCP_SACK_BLOCKS_EXPECTED 2 + static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_sock *tp = tcp_sk(sk); @@ -4360,6 +4366,8 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) { if (tcp_sack_extend(sp, seq, end_seq)) { + if (this_sack >= TCP_SACK_BLOCKS_EXPECTED) + tcp_sack_compress_send_ack(sk); /* Rotate this_sack to the first one. */ for (; this_sack > 0; this_sack--, sp--) swap(*sp, *(sp - 1)); @@ -4369,6 +4377,9 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) } } + if (this_sack >= TCP_SACK_BLOCKS_EXPECTED) + tcp_sack_compress_send_ack(sk); + /* Could not find an adjacent existing SACK, build a new one, * put it at the front, and shift everyone else down. We * always know there is at least one SACK present already here. * If the sack array is full, forget about the last one. */ if (this_sack >= TCP_NUM_SACKS) { - tcp_sack_compress_send_ack(sk); this_sack--; tp->rx_opt.num_sacks--; sp--; -- cgit v1.2.3 From a70437cc09a11771870e9f6bfc0ba1237161daa8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Apr 2020 10:35:43 -0700 Subject: tcp: add hrtimer slack to sack compression Add a sysctl to control hrtimer slack, with a default of 100 usec. This gives the opportunity to reduce system overhead, and help very short RTT flows. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S.
Miller --- Documentation/networking/ip-sysctl.rst | 8 ++++++++ include/net/netns/ipv4.h | 1 + net/ipv4/sysctl_net_ipv4.c | 7 +++++++ net/ipv4/tcp_input.c | 5 +++-- net/ipv4/tcp_ipv4.c | 1 + 5 files changed, 20 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 3266aee9e052..50b440d29a13 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -651,6 +651,14 @@ tcp_comp_sack_delay_ns - LONG INTEGER Default : 1,000,000 ns (1 ms) +tcp_comp_sack_slack_ns - LONG INTEGER + This sysctl controls the slack used when arming the + timer used by SACK compression. This gives extra time + for small RTT flows, and reduces system overhead by allowing + opportunistic reduction of timer interrupts. + + Default : 100,000 ns (100 us) + tcp_comp_sack_nr - INTEGER Max number of SACK that can be compressed. Using 0 disables SACK compression. diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 5acdb4d414c4..9e36738c1fe1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -173,6 +173,7 @@ struct netns_ipv4 { int sysctl_tcp_rmem[3]; int sysctl_tcp_comp_sack_nr; unsigned long sysctl_tcp_comp_sack_delay_ns; + unsigned long sysctl_tcp_comp_sack_slack_ns; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 95ad71e76cc3..3a628423d27b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1329,6 +1329,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, + { + .procname = "tcp_comp_sack_slack_ns", + .data = &init_net.ipv4.sysctl_tcp_comp_sack_slack_ns, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, { .procname = "tcp_comp_sack_nr", .data = &init_net.ipv4.sysctl_tcp_comp_sack_nr, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ef921ecba415..d68128a672ab 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5324,8 +5324,9 @@ send_now: delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns, rtt * (NSEC_PER_USEC >> 3)/20); sock_hold(sk); - hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay), - HRTIMER_MODE_REL_PINNED_SOFT); + hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay), + sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns, + HRTIMER_MODE_REL_PINNED_SOFT); } static inline void tcp_ack_snd_check(struct sock *sk) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 83a5d24e13b8..6c05f1ceb538 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2780,6 +2780,7 @@ static int __net_init tcp_sk_init(struct net *net) sizeof(init_net.ipv4.sysctl_tcp_wmem)); } net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; + net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; net->ipv4.sysctl_tcp_comp_sack_nr = 44; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.2.3 From 34a9c361dd480041d790fff3d6ea58513c8769e8 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 30 Apr 2020 17:37:02 +0000 Subject: hsr: remove hsr interface if all slaves are removed When all hsr slave interfaces are removed, the hsr interface doesn't work. At that moment, it's fine to remove the unused hsr interface automatically to save resources.
That's a common behavior of virtual interfaces. Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_main.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 26d6c39f24e1..e2564de67603 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -15,12 +15,23 @@ #include "hsr_framereg.h" #include "hsr_slave.h" +static bool hsr_slave_empty(struct hsr_priv *hsr) +{ + struct hsr_port *port; + + hsr_for_each_port(hsr, port) + if (port->type != HSR_PT_MASTER) + return false; + return true; +} + static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, void *ptr) { - struct net_device *dev; struct hsr_port *port, *master; + struct net_device *dev; struct hsr_priv *hsr; + LIST_HEAD(list_kill); int mtu_max; int res; @@ -85,8 +96,15 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, master->dev->mtu = mtu_max; break; case NETDEV_UNREGISTER: - if (!is_hsr_master(dev)) + if (!is_hsr_master(dev)) { + master = hsr_port_get_hsr(port->hsr, HSR_PT_MASTER); hsr_del_port(port); + if (hsr_slave_empty(master->hsr)) { + unregister_netdevice_queue(master->dev, + &list_kill); + unregister_netdevice_many(&list_kill); + } + } break; case NETDEV_PRE_TYPE_CHANGE: /* HSR works only on Ethernet devices. Refuse slave to change -- cgit v1.2.3 From 47a1494b8208461094923400c396ce4b8163c064 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 30 Apr 2020 22:13:05 +0200 Subject: netlink: remove type-unsafe validation_data pointer In the netlink policy, we currently have a void *validation_data that's pointing to different things: * a u32 value for bitfield32, * the netlink policy for nested/nested array * the string for NLA_REJECT Remove the pointer and place appropriate type-safe items in the union instead. While at it, completely dissolve the pointer for the bitfield32 case and just put the value there directly. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/netlink.h | 60 +++++++++++++++++++++++++++------------------------ lib/nlattr.c | 20 ++++++++--------- net/sched/act_api.c | 13 +++-------- net/sched/sch_red.c | 9 ++++---- 4 files changed, 49 insertions(+), 53 deletions(-) (limited to 'net') diff --git a/include/net/netlink.h b/include/net/netlink.h index 67c57d6942e3..671b29d170a8 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -217,7 +217,7 @@ enum nla_policy_validation { * NLA_NESTED, * NLA_NESTED_ARRAY Length verification is done by checking len of * nested header (or empty); len field is used if - * validation_data is also used, for the max attr + * nested_policy is also used, for the max attr * number in the nested policy. * NLA_U8, NLA_U16, * NLA_U32, NLA_U64, @@ -235,27 +235,25 @@ enum nla_policy_validation { * NLA_MIN_LEN Minimum length of attribute payload * All other Minimum length of attribute payload * - * Meaning of `validation_data' field: + * Meaning of validation union: * NLA_BITFIELD32 This is a 32-bit bitmap/bitselector attribute and - * validation data must point to a u32 value of valid - * flags - * NLA_REJECT This attribute is always rejected and validation data + * `bitfield32_valid' is the u32 value of valid flags + * NLA_REJECT This attribute is always rejected and `reject_message' * may point to a string to report as the error instead * of the generic one in extended ACK. 
- * NLA_NESTED Points to a nested policy to validate, must also set - * `len' to the max attribute number. + * NLA_NESTED `nested_policy' to a nested policy to validate, must + * also set `len' to the max attribute number. Use the + * provided NLA_POLICY_NESTED() macro. * Note that nla_parse() will validate, but of course not * parse, the nested sub-policies. - * NLA_NESTED_ARRAY Points to a nested policy to validate, must also set - * `len' to the max attribute number. The difference to - * NLA_NESTED is the structure - NLA_NESTED has the - * nested attributes directly inside, while an array has - * the nested attributes at another level down and the - * attributes directly in the nesting don't matter. - * All other Unused - but note that it's a union - * - * Meaning of `min' and `max' fields, use via NLA_POLICY_MIN, NLA_POLICY_MAX - * and NLA_POLICY_RANGE: + * NLA_NESTED_ARRAY `nested_policy' points to a nested policy to validate, + * must also set `len' to the max attribute number. Use + * the provided NLA_POLICY_NESTED_ARRAY() macro. + * The difference to NLA_NESTED is the structure: + * NLA_NESTED has the nested attributes directly inside + * while an array has the nested attributes at another + * level down and the attribute types directly in the + * nesting don't matter. * NLA_U8, * NLA_U16, * NLA_U32, @@ -263,29 +261,31 @@ enum nla_policy_validation { * NLA_S8, * NLA_S16, * NLA_S32, - * NLA_S64 These are used depending on the validation_type - * field, if that is min/max/range then the minimum, - * maximum and both are used (respectively) to check + * NLA_S64 The `min' and `max' fields are used depending on the + * validation_type field, if that is min/max/range then + * the min, max or both are used (respectively) to check * the value of the integer attribute. * Note that in the interest of code simplicity and * struct size both limits are s16, so you cannot * enforce a range that doesn't fall within the range * of s16 - do that as usual in the code instead. + * Use the NLA_POLICY_MIN(), NLA_POLICY_MAX() and + * NLA_POLICY_RANGE() macros. * All other Unused - but note that it's a union * * Meaning of `validate' field, use via NLA_POLICY_VALIDATE_FN: - * NLA_BINARY Validation function called for the attribute, - * not compatible with use of the validation_data - * as in NLA_BITFIELD32, NLA_REJECT, NLA_NESTED and - * NLA_NESTED_ARRAY. + * NLA_BINARY Validation function called for the attribute. 
* All other Unused - but note that it's a union * * Example: + * + * static const u32 myvalidflags = 0xff231023; + * * static const struct nla_policy my_policy[ATTR_MAX+1] = { * [ATTR_FOO] = { .type = NLA_U16 }, * [ATTR_BAR] = { .type = NLA_STRING, .len = BARSIZ }, * [ATTR_BAZ] = { .type = NLA_EXACT_LEN, .len = sizeof(struct mystruct) }, - * [ATTR_GOO] = { .type = NLA_BITFIELD32, .validation_data = &myvalidflags }, + * [ATTR_GOO] = NLA_POLICY_BITFIELD32(myvalidflags), * }; */ struct nla_policy { @@ -293,7 +293,9 @@ struct nla_policy { u8 validation_type; u16 len; union { - const void *validation_data; + const u32 bitfield32_valid; + const char *reject_message; + const struct nla_policy *nested_policy; struct { s16 min, max; }; @@ -329,13 +331,15 @@ struct nla_policy { #define NLA_POLICY_ETH_ADDR_COMPAT NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN) #define _NLA_POLICY_NESTED(maxattr, policy) \ - { .type = NLA_NESTED, .validation_data = policy, .len = maxattr } + { .type = NLA_NESTED, .nested_policy = policy, .len = maxattr } #define _NLA_POLICY_NESTED_ARRAY(maxattr, policy) \ - { .type = NLA_NESTED_ARRAY, .validation_data = policy, .len = maxattr } + { .type = NLA_NESTED_ARRAY, .nested_policy = policy, .len = maxattr } #define NLA_POLICY_NESTED(policy) \ _NLA_POLICY_NESTED(ARRAY_SIZE(policy) - 1, policy) #define NLA_POLICY_NESTED_ARRAY(policy) \ _NLA_POLICY_NESTED_ARRAY(ARRAY_SIZE(policy) - 1, policy) +#define NLA_POLICY_BITFIELD32(valid) \ + { .type = NLA_BITFIELD32, .bitfield32_valid = valid } #define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition)) #define NLA_ENSURE_INT_TYPE(tp) \ diff --git a/lib/nlattr.c b/lib/nlattr.c index cace9b307781..3df05db732ca 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -45,7 +45,7 @@ static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = { }; static int validate_nla_bitfield32(const struct nlattr *nla, - const u32 *valid_flags_mask) + const u32 valid_flags_mask) { const struct nla_bitfield32 *bf = nla_data(nla); @@ -53,11 +53,11 @@ static int validate_nla_bitfield32(const struct nlattr *nla, return -EINVAL; /*disallow invalid bit selector */ - if (bf->selector & ~*valid_flags_mask) + if (bf->selector & ~valid_flags_mask) return -EINVAL; /*disallow invalid bit values */ - if (bf->value & ~*valid_flags_mask) + if (bf->value & ~valid_flags_mask) return -EINVAL; /*disallow valid bit values that are not selected*/ @@ -206,9 +206,9 @@ static int validate_nla(const struct nlattr *nla, int maxtype, break; case NLA_REJECT: - if (extack && pt->validation_data) { + if (extack && pt->reject_message) { NL_SET_BAD_ATTR(extack, nla); - extack->_msg = pt->validation_data; + extack->_msg = pt->reject_message; return -EINVAL; } err = -EINVAL; @@ -223,7 +223,7 @@ static int validate_nla(const struct nlattr *nla, int maxtype, if (attrlen != sizeof(struct nla_bitfield32)) goto out_err; - err = validate_nla_bitfield32(nla, pt->validation_data); + err = validate_nla_bitfield32(nla, pt->bitfield32_valid); if (err) goto out_err; break; @@ -268,9 +268,9 @@ static int validate_nla(const struct nlattr *nla, int maxtype, break; if (attrlen < NLA_HDRLEN) goto out_err; - if (pt->validation_data) { + if (pt->nested_policy) { err = __nla_validate(nla_data(nla), nla_len(nla), pt->len, - pt->validation_data, validate, + pt->nested_policy, validate, extack); if (err < 0) { /* @@ -289,11 +289,11 @@ static int validate_nla(const struct nlattr *nla, int maxtype, break; if (attrlen < NLA_HDRLEN) goto out_err; - if (pt->validation_data) { + if (pt->nested_policy) { int err; err = 
nla_validate_array(nla_data(nla), nla_len(nla), - pt->len, pt->validation_data, + pt->len, pt->nested_policy, extack, validate); if (err < 0) { /* diff --git a/net/sched/act_api.c b/net/sched/act_api.c index df4560909157..fbbec2e562f5 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -876,19 +876,14 @@ static u8 tcf_action_hw_stats_get(struct nlattr *hw_stats_attr) return hw_stats_bf.value; } -static const u32 tca_act_flags_allowed = TCA_ACT_FLAGS_NO_PERCPU_STATS; -static const u32 tca_act_hw_stats_allowed = TCA_ACT_HW_STATS_ANY; - static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { [TCA_ACT_KIND] = { .type = NLA_STRING }, [TCA_ACT_INDEX] = { .type = NLA_U32 }, [TCA_ACT_COOKIE] = { .type = NLA_BINARY, .len = TC_COOKIE_MAX_SIZE }, [TCA_ACT_OPTIONS] = { .type = NLA_NESTED }, - [TCA_ACT_FLAGS] = { .type = NLA_BITFIELD32, - .validation_data = &tca_act_flags_allowed }, - [TCA_ACT_HW_STATS] = { .type = NLA_BITFIELD32, - .validation_data = &tca_act_hw_stats_allowed }, + [TCA_ACT_FLAGS] = NLA_POLICY_BITFIELD32(TCA_ACT_FLAGS_NO_PERCPU_STATS), + [TCA_ACT_HW_STATS] = NLA_POLICY_BITFIELD32(TCA_ACT_HW_STATS_ANY), }; struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, @@ -1454,10 +1449,8 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, return ret; } -static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON; static const struct nla_policy tcaa_policy[TCA_ROOT_MAX + 1] = { - [TCA_ROOT_FLAGS] = { .type = NLA_BITFIELD32, - .validation_data = &tcaa_root_flags_allowed }, + [TCA_ROOT_FLAGS] = NLA_POLICY_BITFIELD32(TCA_FLAG_LARGE_DUMP_ON), [TCA_ROOT_TIME_DELTA] = { .type = NLA_U32 }, }; diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index c7de47c942e3..555a1b9e467f 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -48,7 +48,7 @@ struct red_sched_data { struct Qdisc *qdisc; }; -static const u32 red_supported_flags = TC_RED_HISTORIC_FLAGS | TC_RED_NODROP; +#define TC_RED_SUPPORTED_FLAGS (TC_RED_HISTORIC_FLAGS | TC_RED_NODROP) static inline int red_use_ecn(struct red_sched_data *q) { @@ -212,8 +212,7 @@ static const struct nla_policy red_policy[TCA_RED_MAX + 1] = { [TCA_RED_PARMS] = { .len = sizeof(struct tc_red_qopt) }, [TCA_RED_STAB] = { .len = RED_STAB_SIZE }, [TCA_RED_MAX_P] = { .type = NLA_U32 }, - [TCA_RED_FLAGS] = { .type = NLA_BITFIELD32, - .validation_data = &red_supported_flags }, + [TCA_RED_FLAGS] = NLA_POLICY_BITFIELD32(TC_RED_SUPPORTED_FLAGS), }; static int red_change(struct Qdisc *sch, struct nlattr *opt, @@ -248,7 +247,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt, return -EINVAL; err = red_get_flags(ctl->flags, TC_RED_HISTORIC_FLAGS, - tb[TCA_RED_FLAGS], red_supported_flags, + tb[TCA_RED_FLAGS], TC_RED_SUPPORTED_FLAGS, &flags_bf, &userbits, extack); if (err) return err; @@ -372,7 +371,7 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) || nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P) || nla_put_bitfield32(skb, TCA_RED_FLAGS, - q->flags, red_supported_flags)) + q->flags, TC_RED_SUPPORTED_FLAGS)) goto nla_put_failure; return nla_nest_end(skb, opts); -- cgit v1.2.3 From d15da2a2e813679aeac8bff3be38d3adc849c1a6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 30 Apr 2020 22:13:07 +0200 Subject: nl80211: link recursive netlink nested policy Now that we have limited recursive policy validation to avoid stack overflows, change nl80211 to actually link the nested policy (linking back to itself eventually), which allows some 
code cleanups. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/wireless/nl80211.c | 10 ++++------ net/wireless/nl80211.h | 2 -- net/wireless/pmsr.c | 3 +-- 3 files changed, 5 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 692bcd35f809..57c618b6cb0e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -253,6 +253,8 @@ static int validate_ie_attr(const struct nlattr *attr, } /* policy for the attributes */ +static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR]; + static const struct nla_policy nl80211_ftm_responder_policy[NL80211_FTM_RESP_ATTR_MAX + 1] = { [NL80211_FTM_RESP_ATTR_ENABLED] = { .type = NLA_FLAG, }, @@ -296,11 +298,7 @@ nl80211_pmsr_req_attr_policy[NL80211_PMSR_REQ_ATTR_MAX + 1] = { static const struct nla_policy nl80211_psmr_peer_attr_policy[NL80211_PMSR_PEER_ATTR_MAX + 1] = { [NL80211_PMSR_PEER_ATTR_ADDR] = NLA_POLICY_ETH_ADDR, - /* - * we could specify this again to be the top-level policy, - * but that would open us up to recursion problems ... - */ - [NL80211_PMSR_PEER_ATTR_CHAN] = { .type = NLA_NESTED }, + [NL80211_PMSR_PEER_ATTR_CHAN] = NLA_POLICY_NESTED(nl80211_policy), [NL80211_PMSR_PEER_ATTR_REQ] = NLA_POLICY_NESTED(nl80211_pmsr_req_attr_policy), [NL80211_PMSR_PEER_ATTR_RESP] = { .type = NLA_REJECT }, @@ -347,7 +345,7 @@ nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = { NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), }; -const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { +static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [0] = { .strict_start_type = NL80211_ATTR_HE_OBSS_PD }, [NL80211_ATTR_WIPHY] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_NAME] = { .type = NLA_NUL_STRING, diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index a41e94a49a89..d3e8e426c486 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -11,8 +11,6 @@ int nl80211_init(void); void nl80211_exit(void); -extern const struct nla_policy nl80211_policy[NUM_NL80211_ATTR]; - void *nl80211hdr_put(struct sk_buff *skb, u32 portid, u32 seq, int flags, u8 cmd); bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index 63dc8023447f..a95c79d18349 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -187,10 +187,9 @@ static int pmsr_parse_peer(struct cfg80211_registered_device *rdev, /* reuse info->attrs */ memset(info->attrs, 0, sizeof(*info->attrs) * (NL80211_ATTR_MAX + 1)); - /* need to validate here, we don't want to have validation recursion */ err = nla_parse_nested_deprecated(info->attrs, NL80211_ATTR_MAX, tb[NL80211_PMSR_PEER_ATTR_CHAN], - nl80211_policy, info->extack); + NULL, info->extack); if (err) return err; -- cgit v1.2.3 From c7721c05a6217491810f406ec28df80a9bcf3546 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 30 Apr 2020 22:13:10 +0200 Subject: netlink: remove NLA_EXACT_LEN_WARN Use a validation type instead, so we can later expose the NLA_* values to userspace for policy descriptions. Some transformations were done with this spatch: @@ identifier p; expression X, L, A; @@ struct nla_policy p[X] = { [A] = -{ .type = NLA_EXACT_LEN_WARN, .len = L }, +NLA_POLICY_EXACT_LEN_WARN(L), ... }; Signed-off-by: Johannes Berg Signed-off-by: David S. 
Miller --- include/net/netlink.h | 15 +++++----- lib/nlattr.c | 16 ++++++---- net/wireless/nl80211.c | 81 +++++++++++--------------------------------------- 3 files changed, 36 insertions(+), 76 deletions(-) (limited to 'net') diff --git a/include/net/netlink.h b/include/net/netlink.h index 4acd7165e900..4d4a733f1e8d 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -182,7 +182,6 @@ enum { NLA_BITFIELD32, NLA_REJECT, NLA_EXACT_LEN, - NLA_EXACT_LEN_WARN, NLA_MIN_LEN, __NLA_TYPE_MAX, }; @@ -204,6 +203,7 @@ enum nla_policy_validation { NLA_VALIDATE_MAX, NLA_VALIDATE_RANGE_PTR, NLA_VALIDATE_FUNCTION, + NLA_VALIDATE_WARN_TOO_LONG, }; /** @@ -237,10 +237,10 @@ enum nla_policy_validation { * just like "All other" * NLA_BITFIELD32 Unused * NLA_REJECT Unused - * NLA_EXACT_LEN Attribute must have exactly this length, otherwise - * it is rejected. - * NLA_EXACT_LEN_WARN Attribute should have exactly this length, a warning - * is logged if it is longer, shorter is rejected. + * NLA_EXACT_LEN Attribute should have exactly this length, otherwise + * it is rejected or warned about, the latter happening + * if and only if the `validation_type' is set to + * NLA_VALIDATE_WARN_TOO_LONG. * NLA_MIN_LEN Minimum length of attribute payload * All other Minimum length of attribute payload * @@ -350,8 +350,9 @@ struct nla_policy { }; #define NLA_POLICY_EXACT_LEN(_len) { .type = NLA_EXACT_LEN, .len = _len } -#define NLA_POLICY_EXACT_LEN_WARN(_len) { .type = NLA_EXACT_LEN_WARN, \ - .len = _len } +#define NLA_POLICY_EXACT_LEN_WARN(_len) \ + { .type = NLA_EXACT_LEN, .len = _len, \ + .validation_type = NLA_VALIDATE_WARN_TOO_LONG, } #define NLA_POLICY_MIN_LEN(_len) { .type = NLA_MIN_LEN, .len = _len } #define NLA_POLICY_ETH_ADDR NLA_POLICY_EXACT_LEN(ETH_ALEN) diff --git a/lib/nlattr.c b/lib/nlattr.c index 21ef3998b9d9..6dcbe1bedd3b 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -261,7 +261,9 @@ static int validate_nla(const struct nlattr *nla, int maxtype, BUG_ON(pt->type > NLA_TYPE_MAX); if ((nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) || - (pt->type == NLA_EXACT_LEN_WARN && attrlen != pt->len)) { + (pt->type == NLA_EXACT_LEN && + pt->validation_type == NLA_VALIDATE_WARN_TOO_LONG && + attrlen != pt->len)) { pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, type); if (validate & NL_VALIDATE_STRICT_ATTRS) { @@ -287,11 +289,6 @@ static int validate_nla(const struct nlattr *nla, int maxtype, } switch (pt->type) { - case NLA_EXACT_LEN: - if (attrlen != pt->len) - goto out_err; - break; - case NLA_REJECT: if (extack && pt->reject_message) { NL_SET_BAD_ATTR(extack, nla); @@ -405,6 +402,13 @@ static int validate_nla(const struct nlattr *nla, int maxtype, goto out_err; break; + case NLA_EXACT_LEN: + if (pt->validation_type != NLA_VALIDATE_WARN_TOO_LONG) { + if (attrlen != pt->len) + goto out_err; + break; + } + /* fall through */ default: if (pt->len) minlen = pt->len; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 57c618b6cb0e..519414468b5d 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -376,11 +376,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_IFINDEX] = { .type = NLA_U32 }, [NL80211_ATTR_IFNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ-1 }, - [NL80211_ATTR_MAC] = { .type = NLA_EXACT_LEN_WARN, .len = ETH_ALEN }, - [NL80211_ATTR_PREV_BSSID] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_ATTR_MAC] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), + 
[NL80211_ATTR_PREV_BSSID] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_KEY] = { .type = NLA_NESTED, }, [NL80211_ATTR_KEY_DATA] = { .type = NLA_BINARY, @@ -432,10 +429,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MESH_CONFIG] = { .type = NLA_NESTED }, [NL80211_ATTR_SUPPORT_MESH_AUTH] = { .type = NLA_FLAG }, - [NL80211_ATTR_HT_CAPABILITY] = { - .type = NLA_EXACT_LEN_WARN, - .len = NL80211_HT_CAPABILITY_LEN - }, + [NL80211_ATTR_HT_CAPABILITY] = NLA_POLICY_EXACT_LEN_WARN(NL80211_HT_CAPABILITY_LEN), [NL80211_ATTR_MGMT_SUBTYPE] = { .type = NLA_U8 }, [NL80211_ATTR_IE] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, @@ -466,10 +460,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 }, [NL80211_ATTR_PID] = { .type = NLA_U32 }, [NL80211_ATTR_4ADDR] = { .type = NLA_U8 }, - [NL80211_ATTR_PMKID] = { - .type = NLA_EXACT_LEN_WARN, - .len = WLAN_PMKID_LEN - }, + [NL80211_ATTR_PMKID] = NLA_POLICY_EXACT_LEN_WARN(WLAN_PMKID_LEN), [NL80211_ATTR_DURATION] = { .type = NLA_U32 }, [NL80211_ATTR_COOKIE] = { .type = NLA_U64 }, [NL80211_ATTR_TX_RATES] = { .type = NLA_NESTED }, @@ -533,10 +524,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_WDEV] = { .type = NLA_U64 }, [NL80211_ATTR_USER_REG_HINT_TYPE] = { .type = NLA_U32 }, [NL80211_ATTR_AUTH_DATA] = { .type = NLA_BINARY, }, - [NL80211_ATTR_VHT_CAPABILITY] = { - .type = NLA_EXACT_LEN_WARN, - .len = NL80211_VHT_CAPABILITY_LEN - }, + [NL80211_ATTR_VHT_CAPABILITY] = NLA_POLICY_EXACT_LEN_WARN(NL80211_VHT_CAPABILITY_LEN), [NL80211_ATTR_SCAN_FLAGS] = { .type = NLA_U32 }, [NL80211_ATTR_P2P_CTWINDOW] = NLA_POLICY_MAX(NLA_U8, 127), [NL80211_ATTR_P2P_OPPPS] = NLA_POLICY_MAX(NLA_U8, 1), @@ -574,10 +562,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_VENDOR_DATA] = { .type = NLA_BINARY }, [NL80211_ATTR_QOS_MAP] = { .type = NLA_BINARY, .len = IEEE80211_QOS_MAP_LEN_MAX }, - [NL80211_ATTR_MAC_HINT] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_ATTR_MAC_HINT] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_WIPHY_FREQ_HINT] = { .type = NLA_U32 }, [NL80211_ATTR_TDLS_PEER_CAPABILITY] = { .type = NLA_U32 }, [NL80211_ATTR_SOCKET_OWNER] = { .type = NLA_FLAG }, @@ -589,10 +574,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_ADMITTED_TIME] = { .type = NLA_U16 }, [NL80211_ATTR_SMPS_MODE] = { .type = NLA_U8 }, [NL80211_ATTR_OPER_CLASS] = { .type = NLA_U8 }, - [NL80211_ATTR_MAC_MASK] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_ATTR_MAC_MASK] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_WIPHY_SELF_MANAGED_REG] = { .type = NLA_FLAG }, [NL80211_ATTR_NETNS_FD] = { .type = NLA_U32 }, [NL80211_ATTR_SCHED_SCAN_DELAY] = { .type = NLA_U32 }, @@ -604,21 +586,15 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MU_MIMO_GROUP_DATA] = { .len = VHT_MUMIMO_GROUPS_DATA_LEN }, - [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_NAN_MASTER_PREF] = NLA_POLICY_MIN(NLA_U8, 1), [NL80211_ATTR_BANDS] = { .type = NLA_U32 }, [NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED }, [NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY, .len = FILS_MAX_KEK_LEN }, - [NL80211_ATTR_FILS_NONCES] = { - .type = NLA_EXACT_LEN_WARN, - .len = 2 * FILS_NONCE_LEN - }, + 
[NL80211_ATTR_FILS_NONCES] = NLA_POLICY_EXACT_LEN_WARN(2 * FILS_NONCE_LEN), [NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED] = { .type = NLA_FLAG, }, - [NL80211_ATTR_BSSID] = { .type = NLA_EXACT_LEN_WARN, .len = ETH_ALEN }, + [NL80211_ATTR_BSSID] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI] = { .type = NLA_S8 }, [NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST] = { .len = sizeof(struct nl80211_bss_select_rssi_adjust) @@ -631,7 +607,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] = { .type = NLA_U16 }, [NL80211_ATTR_FILS_ERP_RRK] = { .type = NLA_BINARY, .len = FILS_ERP_MAX_RRK_LEN }, - [NL80211_ATTR_FILS_CACHE_ID] = { .type = NLA_EXACT_LEN_WARN, .len = 2 }, + [NL80211_ATTR_FILS_CACHE_ID] = NLA_POLICY_EXACT_LEN_WARN(2), [NL80211_ATTR_PMK] = { .type = NLA_BINARY, .len = PMK_MAX_LEN }, [NL80211_ATTR_SCHED_SCAN_MULTI] = { .type = NLA_FLAG }, [NL80211_ATTR_EXTERNAL_AUTH_SUPPORT] = { .type = NLA_FLAG }, @@ -701,10 +677,7 @@ static const struct nla_policy nl80211_wowlan_tcp_policy[NUM_NL80211_WOWLAN_TCP] = { [NL80211_WOWLAN_TCP_SRC_IPV4] = { .type = NLA_U32 }, [NL80211_WOWLAN_TCP_DST_IPV4] = { .type = NLA_U32 }, - [NL80211_WOWLAN_TCP_DST_MAC] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_WOWLAN_TCP_DST_MAC] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_WOWLAN_TCP_SRC_PORT] = { .type = NLA_U16 }, [NL80211_WOWLAN_TCP_DST_PORT] = { .type = NLA_U16 }, [NL80211_WOWLAN_TCP_DATA_PAYLOAD] = { .type = NLA_MIN_LEN, .len = 1 }, @@ -734,18 +707,9 @@ nl80211_coalesce_policy[NUM_NL80211_ATTR_COALESCE_RULE] = { /* policy for GTK rekey offload attributes */ static const struct nla_policy nl80211_rekey_policy[NUM_NL80211_REKEY_DATA] = { - [NL80211_REKEY_DATA_KEK] = { - .type = NLA_EXACT_LEN_WARN, - .len = NL80211_KEK_LEN, - }, - [NL80211_REKEY_DATA_KCK] = { - .type = NLA_EXACT_LEN_WARN, - .len = NL80211_KCK_LEN, - }, - [NL80211_REKEY_DATA_REPLAY_CTR] = { - .type = NLA_EXACT_LEN_WARN, - .len = NL80211_REPLAY_CTR_LEN - }, + [NL80211_REKEY_DATA_KEK] = NLA_POLICY_EXACT_LEN_WARN(NL80211_KEK_LEN), + [NL80211_REKEY_DATA_KCK] = NLA_POLICY_EXACT_LEN_WARN(NL80211_KCK_LEN), + [NL80211_REKEY_DATA_REPLAY_CTR] = NLA_POLICY_EXACT_LEN_WARN(NL80211_REPLAY_CTR_LEN), }; static const struct nla_policy @@ -760,10 +724,7 @@ static const struct nla_policy nl80211_match_policy[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1] = { [NL80211_SCHED_SCAN_MATCH_ATTR_SSID] = { .type = NLA_BINARY, .len = IEEE80211_MAX_SSID_LEN }, - [NL80211_SCHED_SCAN_MATCH_ATTR_BSSID] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_SCHED_SCAN_MATCH_ATTR_BSSID] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_SCHED_SCAN_MATCH_ATTR_RSSI] = { .type = NLA_U32 }, [NL80211_SCHED_SCAN_MATCH_PER_BAND_RSSI] = NLA_POLICY_NESTED(nl80211_match_band_rssi_policy), @@ -795,10 +756,7 @@ nl80211_nan_func_policy[NL80211_NAN_FUNC_ATTR_MAX + 1] = { [NL80211_NAN_FUNC_SUBSCRIBE_ACTIVE] = { .type = NLA_FLAG }, [NL80211_NAN_FUNC_FOLLOW_UP_ID] = { .type = NLA_U8 }, [NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID] = { .type = NLA_U8 }, - [NL80211_NAN_FUNC_FOLLOW_UP_DEST] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_NAN_FUNC_FOLLOW_UP_DEST] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_NAN_FUNC_CLOSE_RANGE] = { .type = NLA_FLAG }, [NL80211_NAN_FUNC_TTL] = { .type = NLA_U32 }, [NL80211_NAN_FUNC_SERVICE_INFO] = { .type = NLA_BINARY, @@ -4404,10 +4362,7 @@ static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { .len = NL80211_MAX_SUPP_RATES }, 
[NL80211_TXRATE_HT] = { .type = NLA_BINARY, .len = NL80211_MAX_SUPP_HT_RATES }, - [NL80211_TXRATE_VHT] = { - .type = NLA_EXACT_LEN_WARN, - .len = sizeof(struct nl80211_txrate_vht), - }, + [NL80211_TXRATE_VHT] = NLA_POLICY_EXACT_LEN_WARN(sizeof(struct nl80211_txrate_vht)), [NL80211_TXRATE_GI] = { .type = NLA_U8 }, }; -- cgit v1.2.3 From d07dcf9aadd6b2842b439e8668ff7ea2873f28d7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 30 Apr 2020 22:13:12 +0200 Subject: netlink: add infrastructure to expose policies to userspace Add, and use in generic netlink, helpers to dump out a netlink policy to userspace, including all the range validation data, nested policies etc. This lets userspace discover what the kernel understands. For families/commands other than generic netlink, the helpers need to be used directly in an appropriate command, or we can add some infrastructure (a new netlink family) that those can register their policies with for introspection. I'm not that familiar with non-generic netlink, so that's left out for now. The data exposed to userspace also includes min and max length for binary/string data. I've done that instead of letting the userspace tools figure out whether min/max is intended based on the type, so that we can extend this later in the kernel; we might want to just use the range data, for example. Because of this, I opted not to expose the NLA_* values directly, even if some of them are already exposed via BPF: with min/max length we don't need to have different types here for NLA_BINARY/NLA_MIN_LEN/NLA_EXACT_LEN, we just make them all NL_ATTR_TYPE_BINARY with min/max length optionally set. Similarly, we don't really need NLA_MSECS, and perhaps can remove it in the future - but not if we encode it into the userspace API now. It gets mapped to NL_ATTR_TYPE_U64 here. Note that what is exposed here corresponds to the strict policy interpretation, and NLA_UNSPEC items are omitted entirely. To get those exposed, change them to NLA_MIN_LEN, which behaves in exactly the same way but is exposed. Signed-off-by: Johannes Berg Signed-off-by: David S.
Miller --- include/net/netlink.h | 6 + include/uapi/linux/genetlink.h | 2 + include/uapi/linux/netlink.h | 103 ++++++++++++++ net/netlink/Makefile | 2 +- net/netlink/genetlink.c | 78 +++++++++++ net/netlink/policy.c | 308 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 498 insertions(+), 1 deletion(-) create mode 100644 net/netlink/policy.c (limited to 'net') diff --git a/include/net/netlink.h b/include/net/netlink.h index 557b67f1db99..c0411f14fb53 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -1933,4 +1933,10 @@ void nla_get_range_unsigned(const struct nla_policy *pt, void nla_get_range_signed(const struct nla_policy *pt, struct netlink_range_validation_signed *range); +int netlink_policy_dump_start(const struct nla_policy *policy, + unsigned int maxtype, + unsigned long *state); +bool netlink_policy_dump_loop(unsigned long *state); +int netlink_policy_dump_write(struct sk_buff *skb, unsigned long state); + #endif diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h index 877f7fa95466..9c0636ec2286 100644 --- a/include/uapi/linux/genetlink.h +++ b/include/uapi/linux/genetlink.h @@ -48,6 +48,7 @@ enum { CTRL_CMD_NEWMCAST_GRP, CTRL_CMD_DELMCAST_GRP, CTRL_CMD_GETMCAST_GRP, /* unused */ + CTRL_CMD_GETPOLICY, __CTRL_CMD_MAX, }; @@ -62,6 +63,7 @@ enum { CTRL_ATTR_MAXATTR, CTRL_ATTR_OPS, CTRL_ATTR_MCAST_GROUPS, + CTRL_ATTR_POLICY, __CTRL_ATTR_MAX, }; diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 0a4d73317759..eac8a6a648ea 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -249,4 +249,107 @@ struct nla_bitfield32 { __u32 selector; }; +/* + * policy descriptions - it's specific to each family how this is used + * Normally, it should be retrieved via a dump inside another attribute + * specifying where it applies. + */ + +/** + * enum netlink_attribute_type - type of an attribute + * @NL_ATTR_TYPE_INVALID: unused + * @NL_ATTR_TYPE_FLAG: flag attribute (present/not present) + * @NL_ATTR_TYPE_U8: 8-bit unsigned attribute + * @NL_ATTR_TYPE_U16: 16-bit unsigned attribute + * @NL_ATTR_TYPE_U32: 32-bit unsigned attribute + * @NL_ATTR_TYPE_U64: 64-bit unsigned attribute + * @NL_ATTR_TYPE_S8: 8-bit signed attribute + * @NL_ATTR_TYPE_S16: 16-bit signed attribute + * @NL_ATTR_TYPE_S32: 32-bit signed attribute + * @NL_ATTR_TYPE_S64: 64-bit signed attribute + * @NL_ATTR_TYPE_BINARY: binary data, min/max length may be specified + * @NL_ATTR_TYPE_STRING: string, min/max length may be specified + * @NL_ATTR_TYPE_NUL_STRING: NUL-terminated string, + * min/max length may be specified + * @NL_ATTR_TYPE_NESTED: nested, i.e. the content of this attribute + * consists of sub-attributes. The nested policy and maxtype + * inside may be specified. + * @NL_ATTR_TYPE_NESTED_ARRAY: nested array, i.e. the content of this + * attribute contains sub-attributes whose type is irrelevant + * (just used to separate the array entries) and each such array + * entry has attributes again, the policy for those inner ones + * and the corresponding maxtype may be specified. 
+ * @NL_ATTR_TYPE_BITFIELD32: &struct nla_bitfield32 attribute + */ +enum netlink_attribute_type { + NL_ATTR_TYPE_INVALID, + + NL_ATTR_TYPE_FLAG, + + NL_ATTR_TYPE_U8, + NL_ATTR_TYPE_U16, + NL_ATTR_TYPE_U32, + NL_ATTR_TYPE_U64, + + NL_ATTR_TYPE_S8, + NL_ATTR_TYPE_S16, + NL_ATTR_TYPE_S32, + NL_ATTR_TYPE_S64, + + NL_ATTR_TYPE_BINARY, + NL_ATTR_TYPE_STRING, + NL_ATTR_TYPE_NUL_STRING, + + NL_ATTR_TYPE_NESTED, + NL_ATTR_TYPE_NESTED_ARRAY, + + NL_ATTR_TYPE_BITFIELD32, +}; + +/** + * enum netlink_policy_type_attr - policy type attributes + * @NL_POLICY_TYPE_ATTR_UNSPEC: unused + * @NL_POLICY_TYPE_ATTR_TYPE: type of the attribute, + * &enum netlink_attribute_type (U32) + * @NL_POLICY_TYPE_ATTR_MIN_VALUE_S: minimum value for signed + * integers (S64) + * @NL_POLICY_TYPE_ATTR_MAX_VALUE_S: maximum value for signed + * integers (S64) + * @NL_POLICY_TYPE_ATTR_MIN_VALUE_U: minimum value for unsigned + * integers (U64) + * @NL_POLICY_TYPE_ATTR_MAX_VALUE_U: maximum value for unsigned + * integers (U64) + * @NL_POLICY_TYPE_ATTR_MIN_LENGTH: minimum length for binary + * attributes, no minimum if not given (U32) + * @NL_POLICY_TYPE_ATTR_MAX_LENGTH: maximum length for binary + * attributes, no maximum if not given (U32) + * @NL_POLICY_TYPE_ATTR_POLICY_IDX: sub policy for nested and + * nested array types (U32) + * @NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE: maximum sub policy + * attribute for nested and nested array types, this can + * in theory be < the size of the policy pointed to by + * the index, if limited inside the nesting (U32) + * @NL_POLICY_TYPE_ATTR_BITFIELD32_MASK: valid mask for the + * bitfield32 type (U32) + * @NL_POLICY_TYPE_ATTR_PAD: pad attribute for 64-bit alignment + */ +enum netlink_policy_type_attr { + NL_POLICY_TYPE_ATTR_UNSPEC, + NL_POLICY_TYPE_ATTR_TYPE, + NL_POLICY_TYPE_ATTR_MIN_VALUE_S, + NL_POLICY_TYPE_ATTR_MAX_VALUE_S, + NL_POLICY_TYPE_ATTR_MIN_VALUE_U, + NL_POLICY_TYPE_ATTR_MAX_VALUE_U, + NL_POLICY_TYPE_ATTR_MIN_LENGTH, + NL_POLICY_TYPE_ATTR_MAX_LENGTH, + NL_POLICY_TYPE_ATTR_POLICY_IDX, + NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE, + NL_POLICY_TYPE_ATTR_BITFIELD32_MASK, + NL_POLICY_TYPE_ATTR_PAD, + + /* keep last */ + __NL_POLICY_TYPE_ATTR_MAX, + NL_POLICY_TYPE_ATTR_MAX = __NL_POLICY_TYPE_ATTR_MAX - 1 +}; + #endif /* _UAPI__LINUX_NETLINK_H */ diff --git a/net/netlink/Makefile b/net/netlink/Makefile index de42df7f0068..e05202708c90 100644 --- a/net/netlink/Makefile +++ b/net/netlink/Makefile @@ -3,7 +3,7 @@ # Makefile for the netlink driver. 
# -obj-y := af_netlink.o genetlink.o +obj-y := af_netlink.o genetlink.o policy.o obj-$(CONFIG_NETLINK_DIAG) += netlink_diag.o netlink_diag-y := diag.o diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 9f357aa22b94..2f049692e012 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1043,6 +1043,80 @@ static int genl_ctrl_event(int event, const struct genl_family *family, return 0; } +static int ctrl_dumppolicy(struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct genl_family *rt; + unsigned int fam_id = cb->args[0]; + int err; + + if (!fam_id) { + struct nlattr *tb[CTRL_ATTR_MAX + 1]; + + err = genlmsg_parse(cb->nlh, &genl_ctrl, tb, + genl_ctrl.maxattr, + genl_ctrl.policy, cb->extack); + if (err) + return err; + + if (!tb[CTRL_ATTR_FAMILY_ID] && !tb[CTRL_ATTR_FAMILY_NAME]) + return -EINVAL; + + if (tb[CTRL_ATTR_FAMILY_ID]) { + fam_id = nla_get_u16(tb[CTRL_ATTR_FAMILY_ID]); + } else { + rt = genl_family_find_byname( + nla_data(tb[CTRL_ATTR_FAMILY_NAME])); + if (!rt) + return -ENOENT; + fam_id = rt->id; + } + } + + rt = genl_family_find_byid(fam_id); + if (!rt) + return -ENOENT; + + if (!rt->policy) + return -ENODATA; + + err = netlink_policy_dump_start(rt->policy, rt->maxattr, &cb->args[1]); + if (err) + return err; + + while (netlink_policy_dump_loop(&cb->args[1])) { + void *hdr; + struct nlattr *nest; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &genl_ctrl, + NLM_F_MULTI, CTRL_CMD_GETPOLICY); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, rt->id)) + goto nla_put_failure; + + nest = nla_nest_start(skb, CTRL_ATTR_POLICY); + if (!nest) + goto nla_put_failure; + + if (netlink_policy_dump_write(skb, cb->args[1])) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + genlmsg_end(skb, hdr); + continue; + +nla_put_failure: + genlmsg_cancel(skb, hdr); + break; + } + + cb->args[0] = fam_id; + return skb->len; +} + static const struct genl_ops genl_ctrl_ops[] = { { .cmd = CTRL_CMD_GETFAMILY, @@ -1050,6 +1124,10 @@ static const struct genl_ops genl_ctrl_ops[] = { .doit = ctrl_getfamily, .dumpit = ctrl_dumpfamily, }, + { + .cmd = CTRL_CMD_GETPOLICY, + .dumpit = ctrl_dumppolicy, + }, }; static const struct genl_multicast_group genl_ctrl_groups[] = { diff --git a/net/netlink/policy.c b/net/netlink/policy.c new file mode 100644 index 000000000000..f6491853c797 --- /dev/null +++ b/net/netlink/policy.c @@ -0,0 +1,308 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NETLINK Policy advertisement to userspace + * + * Authors: Johannes Berg + * + * Copyright 2019 Intel Corporation + */ + +#include +#include +#include +#include + +#define INITIAL_POLICIES_ALLOC 10 + +struct nl_policy_dump { + unsigned int policy_idx; + unsigned int attr_idx; + unsigned int n_alloc; + struct { + const struct nla_policy *policy; + unsigned int maxtype; + } policies[]; +}; + +static int add_policy(struct nl_policy_dump **statep, + const struct nla_policy *policy, + unsigned int maxtype) +{ + struct nl_policy_dump *state = *statep; + unsigned int n_alloc, i; + + if (!policy || !maxtype) + return 0; + + for (i = 0; i < state->n_alloc; i++) { + if (state->policies[i].policy == policy) + return 0; + + if (!state->policies[i].policy) { + state->policies[i].policy = policy; + state->policies[i].maxtype = maxtype; + return 0; + } + } + + n_alloc = state->n_alloc + INITIAL_POLICIES_ALLOC; + state = krealloc(state, struct_size(state, policies, n_alloc), + GFP_KERNEL); + if (!state) + return -ENOMEM; + + 
state->policies[state->n_alloc].policy = policy; + state->policies[state->n_alloc].maxtype = maxtype; + state->n_alloc = n_alloc; + *statep = state; + + return 0; +} + +static unsigned int get_policy_idx(struct nl_policy_dump *state, + const struct nla_policy *policy) +{ + unsigned int i; + + for (i = 0; i < state->n_alloc; i++) { + if (state->policies[i].policy == policy) + return i; + } + + WARN_ON_ONCE(1); + return -1; +} + +int netlink_policy_dump_start(const struct nla_policy *policy, + unsigned int maxtype, + unsigned long *_state) +{ + struct nl_policy_dump *state; + unsigned int policy_idx; + int err; + + /* also returns 0 if "*_state" is our ERR_PTR() end marker */ + if (*_state) + return 0; + + /* + * walk the policies and nested ones first, and build + * a linear list of them. + */ + + state = kzalloc(struct_size(state, policies, INITIAL_POLICIES_ALLOC), + GFP_KERNEL); + if (!state) + return -ENOMEM; + state->n_alloc = INITIAL_POLICIES_ALLOC; + + err = add_policy(&state, policy, maxtype); + if (err) + return err; + + for (policy_idx = 0; + policy_idx < state->n_alloc && state->policies[policy_idx].policy; + policy_idx++) { + const struct nla_policy *policy; + unsigned int type; + + policy = state->policies[policy_idx].policy; + + for (type = 0; + type <= state->policies[policy_idx].maxtype; + type++) { + switch (policy[type].type) { + case NLA_NESTED: + case NLA_NESTED_ARRAY: + err = add_policy(&state, + policy[type].nested_policy, + policy[type].len); + if (err) + return err; + break; + default: + break; + } + } + } + + *_state = (unsigned long)state; + + return 0; +} + +static bool netlink_policy_dump_finished(struct nl_policy_dump *state) +{ + return state->policy_idx >= state->n_alloc || + !state->policies[state->policy_idx].policy; +} + +bool netlink_policy_dump_loop(unsigned long *_state) +{ + struct nl_policy_dump *state = (void *)*_state; + + if (IS_ERR(state)) + return false; + + if (netlink_policy_dump_finished(state)) { + kfree(state); + /* store end marker instead of freed state */ + *_state = (unsigned long)ERR_PTR(-ENOENT); + return false; + } + + return true; +} + +int netlink_policy_dump_write(struct sk_buff *skb, unsigned long _state) +{ + struct nl_policy_dump *state = (void *)_state; + const struct nla_policy *pt; + struct nlattr *policy, *attr; + enum netlink_attribute_type type; + bool again; + +send_attribute: + again = false; + + pt = &state->policies[state->policy_idx].policy[state->attr_idx]; + + policy = nla_nest_start(skb, state->policy_idx); + if (!policy) + return -ENOBUFS; + + attr = nla_nest_start(skb, state->attr_idx); + if (!attr) + goto nla_put_failure; + + switch (pt->type) { + default: + case NLA_UNSPEC: + case NLA_REJECT: + /* skip - use NLA_MIN_LEN to advertise such */ + nla_nest_cancel(skb, policy); + again = true; + goto next; + case NLA_NESTED: + type = NL_ATTR_TYPE_NESTED; + /* fall through */ + case NLA_NESTED_ARRAY: + if (pt->type == NLA_NESTED_ARRAY) + type = NL_ATTR_TYPE_NESTED_ARRAY; + if (pt->nested_policy && pt->len && + (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_POLICY_IDX, + get_policy_idx(state, pt->nested_policy)) || + nla_put_u32(skb, NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE, + pt->len))) + goto nla_put_failure; + break; + case NLA_U8: + case NLA_U16: + case NLA_U32: + case NLA_U64: + case NLA_MSECS: { + struct netlink_range_validation range; + + if (pt->type == NLA_U8) + type = NL_ATTR_TYPE_U8; + else if (pt->type == NLA_U16) + type = NL_ATTR_TYPE_U16; + else if (pt->type == NLA_U32) + type = NL_ATTR_TYPE_U32; + else + type = 
NL_ATTR_TYPE_U64; + + nla_get_range_unsigned(pt, &range); + + if (nla_put_u64_64bit(skb, NL_POLICY_TYPE_ATTR_MIN_VALUE_U, + range.min, NL_POLICY_TYPE_ATTR_PAD) || + nla_put_u64_64bit(skb, NL_POLICY_TYPE_ATTR_MAX_VALUE_U, + range.max, NL_POLICY_TYPE_ATTR_PAD)) + goto nla_put_failure; + break; + } + case NLA_S8: + case NLA_S16: + case NLA_S32: + case NLA_S64: { + struct netlink_range_validation_signed range; + + if (pt->type == NLA_S8) + type = NL_ATTR_TYPE_S8; + else if (pt->type == NLA_S16) + type = NL_ATTR_TYPE_S16; + else if (pt->type == NLA_S32) + type = NL_ATTR_TYPE_S32; + else + type = NL_ATTR_TYPE_S64; + + nla_get_range_signed(pt, &range); + + if (nla_put_s64(skb, NL_POLICY_TYPE_ATTR_MIN_VALUE_S, + range.min, NL_POLICY_TYPE_ATTR_PAD) || + nla_put_s64(skb, NL_POLICY_TYPE_ATTR_MAX_VALUE_S, + range.max, NL_POLICY_TYPE_ATTR_PAD)) + goto nla_put_failure; + break; + } + case NLA_BITFIELD32: + type = NL_ATTR_TYPE_BITFIELD32; + if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_BITFIELD32_MASK, + pt->bitfield32_valid)) + goto nla_put_failure; + break; + case NLA_EXACT_LEN: + type = NL_ATTR_TYPE_BINARY; + if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MIN_LENGTH, pt->len) || + nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MAX_LENGTH, pt->len)) + goto nla_put_failure; + break; + case NLA_STRING: + case NLA_NUL_STRING: + case NLA_BINARY: + if (pt->type == NLA_STRING) + type = NL_ATTR_TYPE_STRING; + else if (pt->type == NLA_NUL_STRING) + type = NL_ATTR_TYPE_NUL_STRING; + else + type = NL_ATTR_TYPE_BINARY; + if (pt->len && nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MAX_LENGTH, + pt->len)) + goto nla_put_failure; + break; + case NLA_MIN_LEN: + type = NL_ATTR_TYPE_BINARY; + if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MIN_LENGTH, pt->len)) + goto nla_put_failure; + break; + case NLA_FLAG: + type = NL_ATTR_TYPE_FLAG; + break; + } + + if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_TYPE, type)) + goto nla_put_failure; + + /* finish and move state to next attribute */ + nla_nest_end(skb, attr); + nla_nest_end(skb, policy); + +next: + state->attr_idx += 1; + if (state->attr_idx > state->policies[state->policy_idx].maxtype) { + state->attr_idx = 0; + state->policy_idx++; + } + + if (again) { + if (netlink_policy_dump_finished(state)) + return -ENODATA; + goto send_attribute; + } + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, policy); + return -ENOBUFS; +} -- cgit v1.2.3 From cff9f12b18915d957a2130885a00f8ab15cff7e4 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 30 Apr 2020 22:21:31 +0300 Subject: net/core: Introduce netdev_get_xmit_slave Add a new ndo to get the xmit slave of a master device. The reference counters are not incremented, so the caller must be careful with locks. The user can ask for the xmit slave on the assumption that all the slaves can transmit, by setting the all_slaves arg to true. Signed-off-by: Maor Gottlieb Reviewed-by: Jiri Pirko Reviewed-by: David Ahern Acked-by: David S. Miller Signed-off-by: Saeed Mahameed --- include/linux/netdevice.h | 12 ++++++++++++ net/core/dev.c | 22 ++++++++++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 130a668049ab..26bc0f11b7ad 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1146,6 +1146,12 @@ struct netdev_net_notifier { * int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); * Called to release previously enslaved netdev.
* + * struct net_device *(*ndo_get_xmit_slave)(struct net_device *dev, + * struct sk_buff *skb, + * bool all_slaves); + * Get the xmit slave of master device. If all_slaves is true, function + * assume all the slaves can transmit. + * * Feature/offload setting functions. * netdev_features_t (*ndo_fix_features)(struct net_device *dev, * netdev_features_t features); @@ -1389,6 +1395,9 @@ struct net_device_ops { struct netlink_ext_ack *extack); int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); + struct net_device* (*ndo_get_xmit_slave)(struct net_device *dev, + struct sk_buff *skb, + bool all_slaves); netdev_features_t (*ndo_fix_features)(struct net_device *dev, netdev_features_t features); int (*ndo_set_features)(struct net_device *dev, @@ -2731,6 +2740,9 @@ void netdev_freemem(struct net_device *dev); void synchronize_net(void); int init_dummy_netdev(struct net_device *dev); +struct net_device *netdev_get_xmit_slave(struct net_device *dev, + struct sk_buff *skb, + bool all_slaves); struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); diff --git a/net/core/dev.c b/net/core/dev.c index 9c9e763bfe0e..e6c10980abfd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7785,6 +7785,28 @@ void netdev_bonding_info_change(struct net_device *dev, } EXPORT_SYMBOL(netdev_bonding_info_change); +/** + * netdev_get_xmit_slave - Get the xmit slave of master device + * @skb: The packet + * @all_slaves: assume all the slaves are active + * + * The reference counters are not incremented so the caller must be + * careful with locks. The caller must hold RCU lock. + * %NULL is returned if no slave is found. + */ + +struct net_device *netdev_get_xmit_slave(struct net_device *dev, + struct sk_buff *skb, + bool all_slaves) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_get_xmit_slave) + return NULL; + return ops->ndo_get_xmit_slave(dev, skb, all_slaves); +} +EXPORT_SYMBOL(netdev_get_xmit_slave); + static void netdev_adjacent_add_links(struct net_device *dev) { struct netdev_adjacent *iter; -- cgit v1.2.3 From 883780af72090daf9ab53779a3085a6ddfc468ca Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:27 +0200 Subject: docs: networking: convert x25-iface.txt to ReST Not much to be done here: - add SPDX header; - adjust title markup; - remove a tail whitespace; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/x25-iface.rst | 129 +++++++++++++++++++++++++++++++++ Documentation/networking/x25-iface.txt | 123 ------------------------------- include/uapi/linux/if_x25.h | 2 +- net/x25/Kconfig | 2 +- 5 files changed, 132 insertions(+), 125 deletions(-) create mode 100644 Documentation/networking/x25-iface.rst delete mode 100644 Documentation/networking/x25-iface.txt (limited to 'net') diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index a72fdfb391b6..7a4bdbc111b0 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -115,6 +115,7 @@ Contents: udplite vrf vxlan + x25-iface .. 
only:: subproject and html diff --git a/Documentation/networking/x25-iface.rst b/Documentation/networking/x25-iface.rst new file mode 100644 index 000000000000..df401891dce6 --- /dev/null +++ b/Documentation/networking/x25-iface.rst @@ -0,0 +1,129 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================- +X.25 Device Driver Interface +============================- + +Version 1.1 + + Jonathan Naylor 26.12.96 + +This is a description of the messages to be passed between the X.25 Packet +Layer and the X.25 device driver. They are designed to allow for the easy +setting of the LAPB mode from within the Packet Layer. + +The X.25 device driver will be coded normally as per the Linux device driver +standards. Most X.25 device drivers will be moderately similar to the +already existing Ethernet device drivers. However unlike those drivers, the +X.25 device driver has a state associated with it, and this information +needs to be passed to and from the Packet Layer for proper operation. + +All messages are held in sk_buff's just like real data to be transmitted +over the LAPB link. The first byte of the skbuff indicates the meaning of +the rest of the skbuff, if any more information does exist. + + +Packet Layer to Device Driver +----------------------------- + +First Byte = 0x00 (X25_IFACE_DATA) + +This indicates that the rest of the skbuff contains data to be transmitted +over the LAPB link. The LAPB link should already exist before any data is +passed down. + +First Byte = 0x01 (X25_IFACE_CONNECT) + +Establish the LAPB link. If the link is already established then the connect +confirmation message should be returned as soon as possible. + +First Byte = 0x02 (X25_IFACE_DISCONNECT) + +Terminate the LAPB link. If it is already disconnected then the disconnect +confirmation message should be returned as soon as possible. + +First Byte = 0x03 (X25_IFACE_PARAMS) + +LAPB parameters. To be defined. + + +Device Driver to Packet Layer +----------------------------- + +First Byte = 0x00 (X25_IFACE_DATA) + +This indicates that the rest of the skbuff contains data that has been +received over the LAPB link. + +First Byte = 0x01 (X25_IFACE_CONNECT) + +LAPB link has been established. The same message is used for both a LAPB +link connect_confirmation and a connect_indication. + +First Byte = 0x02 (X25_IFACE_DISCONNECT) + +LAPB link has been terminated. This same message is used for both a LAPB +link disconnect_confirmation and a disconnect_indication. + +First Byte = 0x03 (X25_IFACE_PARAMS) + +LAPB parameters. To be defined. + + + +Possible Problems +================= + +(Henner Eisen, 2000-10-28) + +The X.25 packet layer protocol depends on a reliable datalink service. +The LAPB protocol provides such reliable service. But this reliability +is not preserved by the Linux network device driver interface: + +- With Linux 2.4.x (and above) SMP kernels, packet ordering is not + preserved. Even if a device driver calls netif_rx(skb1) and later + netif_rx(skb2), skb2 might be delivered to the network layer + earlier that skb1. +- Data passed upstream by means of netif_rx() might be dropped by the + kernel if the backlog queue is congested. + +The X.25 packet layer protocol will detect this and reset the virtual +call in question. But many upper layer protocols are not designed to +handle such N-Reset events gracefully. And frequent N-Reset events +will always degrade performance. 
+ +Thus, driver authors should make netif_rx() as reliable as possible: + +SMP re-ordering will not occur if the driver's interrupt handler is +always executed on the same CPU. Thus, + +- Driver authors should use irq affinity for the interrupt handler. + +The probability of packet loss due to backlog congestion can be +reduced by the following measures or a combination thereof: + +(1) Drivers for kernel versions 2.4.x and above should always check the + return value of netif_rx(). If it returns NET_RX_DROP, the + driver's LAPB protocol must not confirm reception of the frame + to the peer. + This will reliably suppress packet loss. The LAPB protocol will + automatically cause the peer to re-transmit the dropped packet + later. + The lapb module interface was modified to support this. Its + data_indication() method should now transparently pass the + netif_rx() return value to the (lapb module) caller. +(2) Drivers for kernel versions 2.2.x should always check the global + variable netdev_dropping when a new frame is received. The driver + should only call netif_rx() if netdev_dropping is zero. Otherwise + the driver should not confirm delivery of the frame and drop it. + Alternatively, the driver can queue the frame internally and call + netif_rx() later when netif_dropping is 0 again. In that case, delivery + confirmation should also be deferred such that the internal queue + cannot grow to much. + This will not reliably avoid packet loss, but the probability + of packet loss in netif_rx() path will be significantly reduced. +(3) Additionally, driver authors might consider to support + CONFIG_NET_HW_FLOWCONTROL. This allows the driver to be woken up + when a previously congested backlog queue becomes empty again. + The driver could uses this for flow-controlling the peer by means + of the LAPB protocol's flow-control service. diff --git a/Documentation/networking/x25-iface.txt b/Documentation/networking/x25-iface.txt deleted file mode 100644 index 7f213b556e85..000000000000 --- a/Documentation/networking/x25-iface.txt +++ /dev/null @@ -1,123 +0,0 @@ - X.25 Device Driver Interface 1.1 - - Jonathan Naylor 26.12.96 - -This is a description of the messages to be passed between the X.25 Packet -Layer and the X.25 device driver. They are designed to allow for the easy -setting of the LAPB mode from within the Packet Layer. - -The X.25 device driver will be coded normally as per the Linux device driver -standards. Most X.25 device drivers will be moderately similar to the -already existing Ethernet device drivers. However unlike those drivers, the -X.25 device driver has a state associated with it, and this information -needs to be passed to and from the Packet Layer for proper operation. - -All messages are held in sk_buff's just like real data to be transmitted -over the LAPB link. The first byte of the skbuff indicates the meaning of -the rest of the skbuff, if any more information does exist. - - -Packet Layer to Device Driver ------------------------------ - -First Byte = 0x00 (X25_IFACE_DATA) - -This indicates that the rest of the skbuff contains data to be transmitted -over the LAPB link. The LAPB link should already exist before any data is -passed down. - -First Byte = 0x01 (X25_IFACE_CONNECT) - -Establish the LAPB link. If the link is already established then the connect -confirmation message should be returned as soon as possible. - -First Byte = 0x02 (X25_IFACE_DISCONNECT) - -Terminate the LAPB link. 
If it is already disconnected then the disconnect -confirmation message should be returned as soon as possible. - -First Byte = 0x03 (X25_IFACE_PARAMS) - -LAPB parameters. To be defined. - - -Device Driver to Packet Layer ------------------------------ - -First Byte = 0x00 (X25_IFACE_DATA) - -This indicates that the rest of the skbuff contains data that has been -received over the LAPB link. - -First Byte = 0x01 (X25_IFACE_CONNECT) - -LAPB link has been established. The same message is used for both a LAPB -link connect_confirmation and a connect_indication. - -First Byte = 0x02 (X25_IFACE_DISCONNECT) - -LAPB link has been terminated. This same message is used for both a LAPB -link disconnect_confirmation and a disconnect_indication. - -First Byte = 0x03 (X25_IFACE_PARAMS) - -LAPB parameters. To be defined. - - - -Possible Problems -================= - -(Henner Eisen, 2000-10-28) - -The X.25 packet layer protocol depends on a reliable datalink service. -The LAPB protocol provides such reliable service. But this reliability -is not preserved by the Linux network device driver interface: - -- With Linux 2.4.x (and above) SMP kernels, packet ordering is not - preserved. Even if a device driver calls netif_rx(skb1) and later - netif_rx(skb2), skb2 might be delivered to the network layer - earlier that skb1. -- Data passed upstream by means of netif_rx() might be dropped by the - kernel if the backlog queue is congested. - -The X.25 packet layer protocol will detect this and reset the virtual -call in question. But many upper layer protocols are not designed to -handle such N-Reset events gracefully. And frequent N-Reset events -will always degrade performance. - -Thus, driver authors should make netif_rx() as reliable as possible: - -SMP re-ordering will not occur if the driver's interrupt handler is -always executed on the same CPU. Thus, - -- Driver authors should use irq affinity for the interrupt handler. - -The probability of packet loss due to backlog congestion can be -reduced by the following measures or a combination thereof: - -(1) Drivers for kernel versions 2.4.x and above should always check the - return value of netif_rx(). If it returns NET_RX_DROP, the - driver's LAPB protocol must not confirm reception of the frame - to the peer. - This will reliably suppress packet loss. The LAPB protocol will - automatically cause the peer to re-transmit the dropped packet - later. - The lapb module interface was modified to support this. Its - data_indication() method should now transparently pass the - netif_rx() return value to the (lapb module) caller. -(2) Drivers for kernel versions 2.2.x should always check the global - variable netdev_dropping when a new frame is received. The driver - should only call netif_rx() if netdev_dropping is zero. Otherwise - the driver should not confirm delivery of the frame and drop it. - Alternatively, the driver can queue the frame internally and call - netif_rx() later when netif_dropping is 0 again. In that case, delivery - confirmation should also be deferred such that the internal queue - cannot grow to much. - This will not reliably avoid packet loss, but the probability - of packet loss in netif_rx() path will be significantly reduced. -(3) Additionally, driver authors might consider to support - CONFIG_NET_HW_FLOWCONTROL. This allows the driver to be woken up - when a previously congested backlog queue becomes empty again. - The driver could uses this for flow-controlling the peer by means - of the LAPB protocol's flow-control service. 
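To make the first-byte convention above concrete, here is a minimal sketch of how a driver might signal LAPB link establishment upstream. It is illustrative only and not part of this patch: x25_notify_connect() is a hypothetical helper, while X25_IFACE_CONNECT and ETH_P_X25 are the real constants (see the if_x25.h hunk that follows). It also checks the netif_rx() return value, as the document recommends for 2.4+ drivers:

#include <linux/if_ether.h>
#include <linux/if_x25.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Hypothetical helper: tell the packet layer the LAPB link is up. */
static int x25_notify_connect(struct net_device *dev)
{
	struct sk_buff *skb = dev_alloc_skb(1);

	if (!skb)
		return -ENOMEM;

	/* The first byte of the skb carries the message meaning. */
	skb_put_u8(skb, X25_IFACE_CONNECT);
	skb->protocol = htons(ETH_P_X25);
	skb->dev = dev;
	skb_reset_mac_header(skb);

	/* NET_RX_DROP here means the confirmation must not be sent. */
	return netif_rx(skb);
}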
diff --git a/include/uapi/linux/if_x25.h b/include/uapi/linux/if_x25.h index 5d962448345f..3a5938e38370 100644 --- a/include/uapi/linux/if_x25.h +++ b/include/uapi/linux/if_x25.h @@ -18,7 +18,7 @@ #include -/* Documentation/networking/x25-iface.txt */ +/* Documentation/networking/x25-iface.rst */ #define X25_IFACE_DATA 0x00 #define X25_IFACE_CONNECT 0x01 #define X25_IFACE_DISCONNECT 0x02 diff --git a/net/x25/Kconfig b/net/x25/Kconfig index 2ecb2e5e241e..a328f79885d1 100644 --- a/net/x25/Kconfig +++ b/net/x25/Kconfig @@ -21,7 +21,7 @@ config X25 . Information about X.25 for Linux is contained in the files and - . + . One connects to an X.25 network either with a dedicated network card using the X.21 protocol (not yet supported by Linux) or one can do -- cgit v1.2.3 From c4ea03fdfd122b4ff293bff643c2369852e9cc1c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:28 +0200 Subject: docs: networking: convert x25.txt to ReST Not much to be done here: - add SPDX header; - add a document title; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/x25.rst | 48 ++++++++++++++++++++++++++++++++++++++ Documentation/networking/x25.txt | 44 ---------------------------------- net/x25/Kconfig | 2 +- 4 files changed, 50 insertions(+), 45 deletions(-) create mode 100644 Documentation/networking/x25.rst delete mode 100644 Documentation/networking/x25.txt (limited to 'net') diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 7a4bdbc111b0..75521e6c473b 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -116,6 +116,7 @@ Contents: vrf vxlan x25-iface + x25 .. only:: subproject and html diff --git a/Documentation/networking/x25.rst b/Documentation/networking/x25.rst new file mode 100644 index 000000000000..00e45d384ba0 --- /dev/null +++ b/Documentation/networking/x25.rst @@ -0,0 +1,48 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================== +Linux X.25 Project +================== + +As my third year dissertation at University I have taken it upon myself to +write an X.25 implementation for Linux. My aim is to provide a complete X.25 +Packet Layer and a LAPB module to allow for "normal" X.25 to be run using +Linux. There are two sorts of X.25 cards available, intelligent ones that +implement LAPB on the card itself, and unintelligent ones that simply do +framing, bit-stuffing and checksumming. These both need to be handled by the +system. + +I therefore decided to write the implementation such that as far as the +Packet Layer is concerned, the link layer was being performed by a lower +layer of the Linux kernel and therefore it did not concern itself with +implementation of LAPB. Therefore the LAPB modules would be called by +unintelligent X.25 card drivers and not by intelligent ones, this would +provide a uniform device driver interface, and simplify configuration. + +To confuse matters a little, an 802.2 LLC implementation for Linux is being +written which will allow X.25 to be run over an Ethernet (or Token Ring) and +conform with the JNT "Pink Book", this will have a different interface to +the Packet Layer but there will be no confusion since the class of device +being served by the LLC will be completely separate from LAPB. The LLC +implementation is being done as part of another protocol project (SNA) and +by a different author. 
+ +Just when you thought that it could not become more confusing, another +option appeared, XOT. This allows X.25 Packet Layer frames to operate over +the Internet using TCP/IP as a reliable link layer. RFC1613 specifies the +format and behaviour of the protocol. If time permits this option will also +be actively considered. + +A linux-x25 mailing list has been created at vger.kernel.org to support the +development and use of Linux X.25. It is early days yet, but interested +parties are welcome to subscribe to it. Just send a message to +majordomo@vger.kernel.org with the following in the message body: + +subscribe linux-x25 +end + +The contents of the Subject line are ignored. + +Jonathan + +g4klx@g4klx.demon.co.uk diff --git a/Documentation/networking/x25.txt b/Documentation/networking/x25.txt deleted file mode 100644 index c91c6d7159ff..000000000000 --- a/Documentation/networking/x25.txt +++ /dev/null @@ -1,44 +0,0 @@ -Linux X.25 Project - -As my third year dissertation at University I have taken it upon myself to -write an X.25 implementation for Linux. My aim is to provide a complete X.25 -Packet Layer and a LAPB module to allow for "normal" X.25 to be run using -Linux. There are two sorts of X.25 cards available, intelligent ones that -implement LAPB on the card itself, and unintelligent ones that simply do -framing, bit-stuffing and checksumming. These both need to be handled by the -system. - -I therefore decided to write the implementation such that as far as the -Packet Layer is concerned, the link layer was being performed by a lower -layer of the Linux kernel and therefore it did not concern itself with -implementation of LAPB. Therefore the LAPB modules would be called by -unintelligent X.25 card drivers and not by intelligent ones, this would -provide a uniform device driver interface, and simplify configuration. - -To confuse matters a little, an 802.2 LLC implementation for Linux is being -written which will allow X.25 to be run over an Ethernet (or Token Ring) and -conform with the JNT "Pink Book", this will have a different interface to -the Packet Layer but there will be no confusion since the class of device -being served by the LLC will be completely separate from LAPB. The LLC -implementation is being done as part of another protocol project (SNA) and -by a different author. - -Just when you thought that it could not become more confusing, another -option appeared, XOT. This allows X.25 Packet Layer frames to operate over -the Internet using TCP/IP as a reliable link layer. RFC1613 specifies the -format and behaviour of the protocol. If time permits this option will also -be actively considered. - -A linux-x25 mailing list has been created at vger.kernel.org to support the -development and use of Linux X.25. It is early days yet, but interested -parties are welcome to subscribe to it. Just send a message to -majordomo@vger.kernel.org with the following in the message body: - -subscribe linux-x25 -end - -The contents of the Subject line are ignored. - -Jonathan - -g4klx@g4klx.demon.co.uk diff --git a/net/x25/Kconfig b/net/x25/Kconfig index a328f79885d1..9f0d58b0b90b 100644 --- a/net/x25/Kconfig +++ b/net/x25/Kconfig @@ -20,7 +20,7 @@ config X25 You can read more about X.25 at and . Information about X.25 for Linux is contained in the files - and + and . 
One connects to an X.25 network either with a dedicated network card -- cgit v1.2.3 From beecf11bc2188067824591612151c4dc6ec383c7 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 30 Apr 2020 16:31:52 -0700 Subject: bpf: Bpf_{g,s}etsockopt for struct bpf_sock_addr Currently, bpf_getsockopt and bpf_setsockopt helpers operate on the 'struct bpf_sock_ops' context in BPF_PROG_TYPE_SOCK_OPS program. Let's generalize them and make them available for 'struct bpf_sock_addr'. That way, in the future, we can allow those helpers in more places. As an example, let's expose those 'struct bpf_sock_addr' based helpers to BPF_CGROUP_INET{4,6}_CONNECT hooks. That way we can override CC before the connection is made. v3: * Expose custom helpers for bpf_sock_addr context instead of doing generic bpf_sock argument (as suggested by Daniel). Even with try_socket_lock that doesn't sleep we have a problem where context sk is already locked and socket lock is non-nestable. v2: * s/BPF_PROG_TYPE_CGROUP_SOCKOPT/BPF_PROG_TYPE_SOCK_OPS/ Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200430233152.199403-1-sdf@google.com --- include/uapi/linux/bpf.h | 14 ++- net/core/filter.c | 118 +++++++++++++++++----- tools/include/uapi/linux/bpf.h | 14 ++- tools/testing/selftests/bpf/config | 1 + tools/testing/selftests/bpf/progs/connect4_prog.c | 46 +++++++++ 5 files changed, 166 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 705e4822f997..b3643e27e264 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1587,7 +1587,7 @@ union bpf_attr { * Return * 0 * - * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) + * int bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1595,6 +1595,11 @@ union bpf_attr { * must be specified, see **setsockopt(2)** for more information. * The option value of length *optlen* is pointed by *optval*. * + * *bpf_socket* should be one of the following: + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * * This helper actually implements a subset of **setsockopt()**. * It supports the following *level*\ s: * @@ -1789,7 +1794,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) + * int bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1798,6 +1803,11 @@ union bpf_attr { * The retrieved value is stored in the structure pointed by * *opval* and of length *optlen*. * + * *bpf_socket* should be one of the following: + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * * This helper actually implements a subset of **getsockopt()**. 
* It supports the following *level*\ s: * diff --git a/net/core/filter.c b/net/core/filter.c index 70b32723e6be..dfaf5df13722 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4194,16 +4194,19 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, - int, level, int, optname, char *, optval, int, optlen) +#define SOCKOPT_CC_REINIT (1 << 0) + +static int _bpf_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen, u32 flags) { - struct sock *sk = bpf_sock->sk; int ret = 0; int val; if (!sk_fullsock(sk)) return -EINVAL; + sock_owned_by_me(sk); + if (level == SOL_SOCKET) { if (optlen != sizeof(int)) return -EINVAL; @@ -4298,7 +4301,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { char name[TCP_CA_NAME_MAX]; - bool reinit = bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN; + bool reinit = flags & SOCKOPT_CC_REINIT; strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); @@ -4345,24 +4348,14 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, return ret; } -static const struct bpf_func_proto bpf_setsockopt_proto = { - .func = bpf_setsockopt, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE, -}; - -BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, - int, level, int, optname, char *, optval, int, optlen) +static int _bpf_getsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) { - struct sock *sk = bpf_sock->sk; - if (!sk_fullsock(sk)) goto err_clear; + + sock_owned_by_me(sk); + #ifdef CONFIG_INET if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { struct inet_connection_sock *icsk; @@ -4428,8 +4421,71 @@ err_clear: return -EINVAL; } -static const struct bpf_func_proto bpf_getsockopt_proto = { - .func = bpf_getsockopt, +BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, + int, level, int, optname, char *, optval, int, optlen) +{ + u32 flags = 0; + return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen, + flags); +} + +static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { + .func = bpf_sock_addr_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx, + int, level, int, optname, char *, optval, int, optlen) +{ + return _bpf_getsockopt(ctx->sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { + .func = bpf_sock_addr_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, + int, level, int, optname, char *, optval, int, optlen) +{ + u32 flags = 0; + if (bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN) + flags |= SOCKOPT_CC_REINIT; + return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen, + flags); +} + +static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { + .func = 
bpf_sock_ops_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, + int, level, int, optname, char *, optval, int, optlen) +{ + return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = { + .func = bpf_sock_ops_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, @@ -6043,6 +6099,22 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_setsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + return &bpf_sock_addr_setsockopt_proto; + default: + return NULL; + } + case BPF_FUNC_getsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + return &bpf_sock_addr_getsockopt_proto; + default: + return NULL; + } default: return bpf_base_func_proto(func_id); } @@ -6261,9 +6333,9 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_setsockopt: - return &bpf_setsockopt_proto; + return &bpf_sock_ops_setsockopt_proto; case BPF_FUNC_getsockopt: - return &bpf_getsockopt_proto; + return &bpf_sock_ops_getsockopt_proto; case BPF_FUNC_sock_ops_cb_flags_set: return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 705e4822f997..b3643e27e264 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1587,7 +1587,7 @@ union bpf_attr { * Return * 0 * - * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) + * int bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1595,6 +1595,11 @@ union bpf_attr { * must be specified, see **setsockopt(2)** for more information. * The option value of length *optlen* is pointed by *optval*. * + * *bpf_socket* should be one of the following: + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * * This helper actually implements a subset of **setsockopt()**. * It supports the following *level*\ s: * @@ -1789,7 +1794,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) + * int bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1798,6 +1803,11 @@ union bpf_attr { * The retrieved value is stored in the structure pointed by * *opval* and of length *optlen*. * + * *bpf_socket* should be one of the following: + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. 
+ * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * * This helper actually implements a subset of **getsockopt()**. * It supports the following *level*\ s: * diff --git a/tools/testing/selftests/bpf/config index 60e3ae5d4e48..6e5b94c036ca 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -37,3 +37,4 @@ CONFIG_IPV6_SIT=m CONFIG_BPF_JIT=y CONFIG_BPF_LSM=y CONFIG_SECURITY=y +CONFIG_TCP_CONG_DCTCP=y diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index ad3c498a8150..972918cd2d7f 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -16,6 +17,10 @@ #define DST_REWRITE_IP4 0x7f000001U #define DST_REWRITE_PORT4 4444 +#ifndef TCP_CA_NAME_MAX +#define TCP_CA_NAME_MAX 16 +#endif + int _version SEC("version") = 1; __attribute__ ((noinline)) @@ -33,6 +38,43 @@ int do_bind(struct bpf_sock_addr *ctx) return 1; } +static __inline int verify_cc(struct bpf_sock_addr *ctx, + char expected[TCP_CA_NAME_MAX]) +{ + char buf[TCP_CA_NAME_MAX]; + int i; + + if (bpf_getsockopt(ctx, SOL_TCP, TCP_CONGESTION, &buf, sizeof(buf))) + return 1; + + for (i = 0; i < TCP_CA_NAME_MAX; i++) { + if (buf[i] != expected[i]) + return 1; + if (buf[i] == 0) + break; + } + + return 0; +} + +static __inline int set_cc(struct bpf_sock_addr *ctx) +{ + char dctcp[TCP_CA_NAME_MAX] = "dctcp"; + char cubic[TCP_CA_NAME_MAX] = "cubic"; + + if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, &dctcp, sizeof(dctcp))) + return 1; + if (verify_cc(ctx, dctcp)) + return 1; + + if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, &cubic, sizeof(cubic))) + return 1; + if (verify_cc(ctx, cubic)) + return 1; + + return 0; +} + SEC("cgroup/connect4") int connect_v4_prog(struct bpf_sock_addr *ctx) { @@ -66,6 +108,10 @@ int connect_v4_prog(struct bpf_sock_addr *ctx) bpf_sk_release(sk); + /* Rewrite congestion control. */ + if (ctx->type == SOCK_STREAM && set_cc(ctx)) + return 0; + /* Rewrite destination. */ ctx->user_ip4 = bpf_htonl(DST_REWRITE_IP4); ctx->user_port = bpf_htons(DST_REWRITE_PORT4); -- cgit v1.2.3 From f0628c524fd188c3f9418e12478dfdfadacba815 Mon Sep 17 00:00:00 2001 From: Cambda Zhu Date: Fri, 24 Apr 2020 16:06:16 +0800 Subject: net: Replace the limit of TCP_LINGER2 with TCP_FIN_TIMEOUT_MAX This patch changes the behavior of TCP_LINGER2 with respect to its limit. sysctl_tcp_fin_timeout used to be the limit of TCP_LINGER2, but now it is only the default value. A new macro named TCP_FIN_TIMEOUT_MAX, set to 2 minutes, is added as the limit of TCP_LINGER2. Since TCP_LINGER2 used sysctl_tcp_fin_timeout as both the default value and the limit in the past, the system administrator could not set the default value for most sockets while letting some sockets have a greater timeout. It was arguably a mistake to let the sysctl be the limit of TCP_LINGER2. We could add a new sysctl to set the max of TCP_LINGER2, but the FIN-WAIT-2 timeout usually does not need to be very long, and 2 minutes is legal considering the TCP specs. Changes in v3: - Remove the new socket option and change the TCP_LINGER2 behavior so that the timeout can be set to a value between sysctl_tcp_fin_timeout and 2 minutes. Changes in v2: - Add int overflow check for the new socket option. Changes in v1: - Add a new socket option to set timeout greater than sysctl_tcp_fin_timeout.
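To make the new limit concrete, a minimal userspace sketch (not part of the patch; the helper name is made up for the example). With this change, a per-socket value above the sysctl default but no greater than two minutes is now honored, and anything larger is clamped to 120 seconds instead of being rejected:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Set the FIN-WAIT-2 timeout of one TCP socket, in seconds. */
static int set_fin_wait2_timeout(int fd, int seconds)
{
	/* Values above 120s are clamped to 120s after this patch;
	 * a negative value still selects the immediate-close
	 * behavior, unchanged from before.
	 */
	return setsockopt(fd, IPPROTO_TCP, TCP_LINGER2,
			  &seconds, sizeof(seconds));
}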
Signed-off-by: Cambda Zhu Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + net/ipv4/tcp.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index dcf9a72eeaa6..1beed50522b1 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -126,6 +126,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); * to combine FIN-WAIT-2 timeout with * TIME-WAIT timer. */ +#define TCP_FIN_TIMEOUT_MAX (120 * HZ) /* max TCP_LINGER2 value (two minutes) */ #define TCP_DELACK_MAX ((unsigned)(HZ/5)) /* maximal time to delay before sending an ACK */ #if HZ >= 100 diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6d87de434377..8c1250103959 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3035,8 +3035,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_LINGER2: if (val < 0) tp->linger2 = -1; - else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ) - tp->linger2 = 0; + else if (val > TCP_FIN_TIMEOUT_MAX / HZ) + tp->linger2 = TCP_FIN_TIMEOUT_MAX; else tp->linger2 = val * HZ; break; -- cgit v1.2.3 From 41a46913bee76d8493681442907ccd989ced2633 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 27 Apr 2020 18:37:43 +0200 Subject: net: fix skb_panic to output real address In skb_panic() the real pointer values are really needed to diagnose issues, e.g. data and head are related (to calculate headroom). The hashed versions of the addresses don't make much sense here. The patch uses the printk specifier %px to print the actual address. The printk documentation on %px: https://www.kernel.org/doc/html/latest/core-api/printk-formats.html#unmodified-addresses Fixes: ad67b74d2469 ("printk: hash addresses printed with %p") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7e29590482ce..1bf0c3d278e7 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -102,7 +102,7 @@ EXPORT_SYMBOL(sysctl_max_skb_frags); static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr, const char msg[]) { - pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n", + pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n", msg, addr, skb->len, sz, skb->head, skb->data, (unsigned long)skb->tail, (unsigned long)skb->end, skb->dev ? skb->dev->name : ""); -- cgit v1.2.3 From a51c328df3106663879645680609eb49b3ff6444 Mon Sep 17 00:00:00 2001 From: Po Liu Date: Fri, 1 May 2020 08:53:15 +0800 Subject: net: qos: introduce a gate control flow action Introduce an ingress frame gate control flow action. The tc gate action works like this: assume there is a gate that lets specified ingress frames pass in certain time slots and drops them in other time slots. A tc filter selects the ingress frames, and the tc gate action specifies in which time slots those frames may be passed to the device and in which time slots they are dropped. The gate action provides an entry list that tells how long the gate stays open and how long it stays closed. The gate action also assigns a start time that tells when the entry list starts; the driver then repeats the gate entry list cyclically. For the software simulation, the gate action requires the user to assign a clock type. Below is a setup example in user space. The tc filter matches a stream whose source ip address is 192.168.0.20, and the gate action owns two time slots. One lasts 200ms with the gate open to let frames pass; the other lasts 100ms with the gate closed so frames are dropped. When the ingress frames exceed a total of 8000000 bytes, the excess frames will be dropped in that 200000000ns time slot. > tc qdisc add dev eth0 ingress > tc filter add dev eth0 parent ffff: protocol ip \ flower src_ip 192.168.0.20 \ action gate index 2 clockid CLOCK_TAI \ sched-entry open 200000000 -1 8000000 \ sched-entry close 100000000 -1 -1 > tc chain del dev eth0 ingress chain 0 "sched-entry" follows the taprio naming style. The gate state is "open"/"close", followed by the period in nanoseconds. The next item is the internal priority value, which selects the ingress queue the frames should be put on; "-1" means wildcard. The last, optional value specifies the maximum number of MSDU octets that are permitted to pass the gate during the specified time interval. If base-time is not set, it defaults to 0; as a result, the start time will be ((N + 1) * cycletime), the smallest such value lying in the future. The example below filters a stream whose destination mac address is 10:00:80:00:00:00 and whose ip type is ICMP, followed by the gate action. The gate action runs with a single close time slot, which means it always keeps closed. The time cycle totals 200000000ns. The base-time is calculated by: 1357000000000 + (N + 1) * cycletime When the resulting value lies in the future, it becomes the start time. The cycletime here would be 200000000ns for this case. > tc filter add dev eth0 parent ffff: protocol ip \ flower skip_hw ip_proto icmp dst_mac 10:00:80:00:00:00 \ action gate index 12 base-time 1357000000000 \ sched-entry close 200000000 -1 -1 \ clockid CLOCK_TAI Signed-off-by: Po Liu Signed-off-by: David S. Miller --- include/net/tc_act/tc_gate.h | 47 +++ include/uapi/linux/pkt_cls.h | 1 + include/uapi/linux/tc_act/tc_gate.h | 47 +++ net/sched/Kconfig | 12 + net/sched/Makefile | 1 + net/sched/act_gate.c | 636 ++++++++++++++++++++++++++++++++++++ 6 files changed, 744 insertions(+) create mode 100644 include/net/tc_act/tc_gate.h create mode 100644 include/uapi/linux/tc_act/tc_gate.h create mode 100644 net/sched/act_gate.c (limited to 'net') diff --git a/include/net/tc_act/tc_gate.h b/include/net/tc_act/tc_gate.h new file mode 100644 index 000000000000..330ad8b02495 --- /dev/null +++ b/include/net/tc_act/tc_gate.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright 2020 NXP */ + +#ifndef __NET_TC_GATE_H +#define __NET_TC_GATE_H + +#include +#include + +struct tcfg_gate_entry { + int index; + u8 gate_state; + u32 interval; + s32 ipv; + s32 maxoctets; + struct list_head list; +}; + +struct tcf_gate_params { + s32 tcfg_priority; + u64 tcfg_basetime; + u64 tcfg_cycletime; + u64 tcfg_cycletime_ext; + u32 tcfg_flags; + s32 tcfg_clockid; + size_t num_entries; + struct list_head entries; +}; + +#define GATE_ACT_GATE_OPEN BIT(0) +#define GATE_ACT_PENDING BIT(1) + +struct tcf_gate { + struct tc_action common; + struct tcf_gate_params param; + u8 current_gate_status; + ktime_t current_close_time; + u32 current_entry_octets; + s32 current_max_octets; + struct tcfg_gate_entry *next_entry; + struct hrtimer hitimer; + enum tk_offsets tk_offset; +}; + +#define to_gate(a) ((struct tcf_gate *)a) + +#endif diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 9f06d29cab70..fc672b232437 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -134,6 +134,7 @@ enum tca_id { TCA_ID_CTINFO, TCA_ID_MPLS, TCA_ID_CT, + TCA_ID_GATE, /* other actions go here */ __TCA_ID_MAX = 255 }; diff --git
a/include/uapi/linux/tc_act/tc_gate.h b/include/uapi/linux/tc_act/tc_gate.h new file mode 100644 index 000000000000..f214b3a6d44f --- /dev/null +++ b/include/uapi/linux/tc_act/tc_gate.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* Copyright 2020 NXP */ + +#ifndef __LINUX_TC_GATE_H +#define __LINUX_TC_GATE_H + +#include + +struct tc_gate { + tc_gen; +}; + +enum { + TCA_GATE_ENTRY_UNSPEC, + TCA_GATE_ENTRY_INDEX, + TCA_GATE_ENTRY_GATE, + TCA_GATE_ENTRY_INTERVAL, + TCA_GATE_ENTRY_IPV, + TCA_GATE_ENTRY_MAX_OCTETS, + __TCA_GATE_ENTRY_MAX, +}; +#define TCA_GATE_ENTRY_MAX (__TCA_GATE_ENTRY_MAX - 1) + +enum { + TCA_GATE_ONE_ENTRY_UNSPEC, + TCA_GATE_ONE_ENTRY, + __TCA_GATE_ONE_ENTRY_MAX, +}; +#define TCA_GATE_ONE_ENTRY_MAX (__TCA_GATE_ONE_ENTRY_MAX - 1) + +enum { + TCA_GATE_UNSPEC, + TCA_GATE_TM, + TCA_GATE_PARMS, + TCA_GATE_PAD, + TCA_GATE_PRIORITY, + TCA_GATE_ENTRY_LIST, + TCA_GATE_BASE_TIME, + TCA_GATE_CYCLE_TIME, + TCA_GATE_CYCLE_TIME_EXT, + TCA_GATE_FLAGS, + TCA_GATE_CLOCKID, + __TCA_GATE_MAX, +}; +#define TCA_GATE_MAX (__TCA_GATE_MAX - 1) + +#endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index bfbefb7bff9d..2f20073f4f84 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -981,6 +981,18 @@ config NET_ACT_CT To compile this code as a module, choose M here: the module will be called act_ct. +config NET_ACT_GATE + tristate "Frame gate entry list control tc action" + depends on NET_CLS_ACT + help + Say Y here to allow to control the ingress flow to be passed at + specific time slot and be dropped at other specific time slot by + the gate entry list. + + If unsure, say N. + To compile this code as a module, choose M here: the + module will be called act_gate. + config NET_IFE_SKBMARK tristate "Support to encoding decoding skb mark on IFE action" depends on NET_ACT_IFE diff --git a/net/sched/Makefile b/net/sched/Makefile index 31c367a6cd09..66bbf9a98f9e 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -30,6 +30,7 @@ obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o obj-$(CONFIG_NET_IFE_SKBTCINDEX) += act_meta_skbtcindex.o obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o obj-$(CONFIG_NET_ACT_CT) += act_ct.o +obj-$(CONFIG_NET_ACT_GATE) += act_gate.o obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c new file mode 100644 index 000000000000..35fc48795541 --- /dev/null +++ b/net/sched/act_gate.c @@ -0,0 +1,636 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Copyright 2020 NXP */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned int gate_net_id; +static struct tc_action_ops act_gate_ops; + +static ktime_t gate_get_time(struct tcf_gate *gact) +{ + ktime_t mono = ktime_get(); + + switch (gact->tk_offset) { + case TK_OFFS_MAX: + return mono; + default: + return ktime_mono_to_any(mono, gact->tk_offset); + } + + return KTIME_MAX; +} + +static int gate_get_start_time(struct tcf_gate *gact, ktime_t *start) +{ + struct tcf_gate_params *param = &gact->param; + ktime_t now, base, cycle; + u64 n; + + base = ns_to_ktime(param->tcfg_basetime); + now = gate_get_time(gact); + + if (ktime_after(base, now)) { + *start = base; + return 0; + } + + cycle = param->tcfg_cycletime; + + /* cycle time should not be zero */ + if (!cycle) + return -EFAULT; + + n = div64_u64(ktime_sub_ns(now, base), cycle); + *start = 
ktime_add_ns(base, (n + 1) * cycle); + return 0; +} + +static void gate_start_timer(struct tcf_gate *gact, ktime_t start) +{ + ktime_t expires; + + expires = hrtimer_get_expires(&gact->hitimer); + if (expires == 0) + expires = KTIME_MAX; + + start = min_t(ktime_t, start, expires); + + hrtimer_start(&gact->hitimer, start, HRTIMER_MODE_ABS_SOFT); +} + +static enum hrtimer_restart gate_timer_func(struct hrtimer *timer) +{ + struct tcf_gate *gact = container_of(timer, struct tcf_gate, + hitimer); + struct tcf_gate_params *p = &gact->param; + struct tcfg_gate_entry *next; + ktime_t close_time, now; + + spin_lock(&gact->tcf_lock); + + next = gact->next_entry; + + /* cycle start, clear pending bit, clear total octets */ + gact->current_gate_status = next->gate_state ? GATE_ACT_GATE_OPEN : 0; + gact->current_entry_octets = 0; + gact->current_max_octets = next->maxoctets; + + gact->current_close_time = ktime_add_ns(gact->current_close_time, + next->interval); + + close_time = gact->current_close_time; + + if (list_is_last(&next->list, &p->entries)) + next = list_first_entry(&p->entries, + struct tcfg_gate_entry, list); + else + next = list_next_entry(next, list); + + now = gate_get_time(gact); + + if (ktime_after(now, close_time)) { + ktime_t cycle, base; + u64 n; + + cycle = p->tcfg_cycletime; + base = ns_to_ktime(p->tcfg_basetime); + n = div64_u64(ktime_sub_ns(now, base), cycle); + close_time = ktime_add_ns(base, (n + 1) * cycle); + } + + gact->next_entry = next; + + hrtimer_set_expires(&gact->hitimer, close_time); + + spin_unlock(&gact->tcf_lock); + + return HRTIMER_RESTART; +} + +static int tcf_gate_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_gate *gact = to_gate(a); + + spin_lock(&gact->tcf_lock); + + tcf_lastuse_update(&gact->tcf_tm); + bstats_update(&gact->tcf_bstats, skb); + + if (unlikely(gact->current_gate_status & GATE_ACT_PENDING)) { + spin_unlock(&gact->tcf_lock); + return gact->tcf_action; + } + + if (!(gact->current_gate_status & GATE_ACT_GATE_OPEN)) + goto drop; + + if (gact->current_max_octets >= 0) { + gact->current_entry_octets += qdisc_pkt_len(skb); + if (gact->current_entry_octets > gact->current_max_octets) { + gact->tcf_qstats.overlimits++; + goto drop; + } + } + + spin_unlock(&gact->tcf_lock); + + return gact->tcf_action; +drop: + gact->tcf_qstats.drops++; + spin_unlock(&gact->tcf_lock); + + return TC_ACT_SHOT; +} + +static const struct nla_policy entry_policy[TCA_GATE_ENTRY_MAX + 1] = { + [TCA_GATE_ENTRY_INDEX] = { .type = NLA_U32 }, + [TCA_GATE_ENTRY_GATE] = { .type = NLA_FLAG }, + [TCA_GATE_ENTRY_INTERVAL] = { .type = NLA_U32 }, + [TCA_GATE_ENTRY_IPV] = { .type = NLA_S32 }, + [TCA_GATE_ENTRY_MAX_OCTETS] = { .type = NLA_S32 }, +}; + +static const struct nla_policy gate_policy[TCA_GATE_MAX + 1] = { + [TCA_GATE_PARMS] = { .len = sizeof(struct tc_gate), + .type = NLA_EXACT_LEN }, + [TCA_GATE_PRIORITY] = { .type = NLA_S32 }, + [TCA_GATE_ENTRY_LIST] = { .type = NLA_NESTED }, + [TCA_GATE_BASE_TIME] = { .type = NLA_U64 }, + [TCA_GATE_CYCLE_TIME] = { .type = NLA_U64 }, + [TCA_GATE_CYCLE_TIME_EXT] = { .type = NLA_U64 }, + [TCA_GATE_FLAGS] = { .type = NLA_U32 }, + [TCA_GATE_CLOCKID] = { .type = NLA_S32 }, +}; + +static int fill_gate_entry(struct nlattr **tb, struct tcfg_gate_entry *entry, + struct netlink_ext_ack *extack) +{ + u32 interval = 0; + + entry->gate_state = nla_get_flag(tb[TCA_GATE_ENTRY_GATE]); + + if (tb[TCA_GATE_ENTRY_INTERVAL]) + interval = nla_get_u32(tb[TCA_GATE_ENTRY_INTERVAL]); + + if (interval == 0) { + 
NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry"); + return -EINVAL; + } + + entry->interval = interval; + + if (tb[TCA_GATE_ENTRY_IPV]) + entry->ipv = nla_get_s32(tb[TCA_GATE_ENTRY_IPV]); + else + entry->ipv = -1; + + if (tb[TCA_GATE_ENTRY_MAX_OCTETS]) + entry->maxoctets = nla_get_s32(tb[TCA_GATE_ENTRY_MAX_OCTETS]); + else + entry->maxoctets = -1; + + return 0; +} + +static int parse_gate_entry(struct nlattr *n, struct tcfg_gate_entry *entry, + int index, struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_GATE_ENTRY_MAX + 1] = { }; + int err; + + err = nla_parse_nested(tb, TCA_GATE_ENTRY_MAX, n, entry_policy, extack); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Could not parse nested entry"); + return -EINVAL; + } + + entry->index = index; + + return fill_gate_entry(tb, entry, extack); +} + +static void release_entry_list(struct list_head *entries) +{ + struct tcfg_gate_entry *entry, *e; + + list_for_each_entry_safe(entry, e, entries, list) { + list_del(&entry->list); + kfree(entry); + } +} + +static int parse_gate_list(struct nlattr *list_attr, + struct tcf_gate_params *sched, + struct netlink_ext_ack *extack) +{ + struct tcfg_gate_entry *entry; + struct nlattr *n; + int err, rem; + int i = 0; + + if (!list_attr) + return -EINVAL; + + nla_for_each_nested(n, list_attr, rem) { + if (nla_type(n) != TCA_GATE_ONE_ENTRY) { + NL_SET_ERR_MSG(extack, "Attribute isn't type 'entry'"); + continue; + } + + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (!entry) { + NL_SET_ERR_MSG(extack, "Not enough memory for entry"); + err = -ENOMEM; + goto release_list; + } + + err = parse_gate_entry(n, entry, i, extack); + if (err < 0) { + kfree(entry); + goto release_list; + } + + list_add_tail(&entry->list, &sched->entries); + i++; + } + + sched->num_entries = i; + + return i; + +release_list: + release_entry_list(&sched->entries); + + return err; +} + +static int tcf_gate_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action **a, + int ovr, int bind, bool rtnl_held, + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) +{ + struct tc_action_net *tn = net_generic(net, gate_net_id); + enum tk_offsets tk_offset = TK_OFFS_TAI; + struct nlattr *tb[TCA_GATE_MAX + 1]; + struct tcf_chain *goto_ch = NULL; + struct tcf_gate_params *p; + s32 clockid = CLOCK_TAI; + struct tcf_gate *gact; + struct tc_gate *parm; + int ret = 0, err; + u64 basetime = 0; + u32 gflags = 0; + s32 prio = -1; + ktime_t start; + u32 index; + + if (!nla) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_GATE_MAX, nla, gate_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_GATE_PARMS]) + return -EINVAL; + + parm = nla_data(tb[TCA_GATE_PARMS]); + index = parm->index; + + err = tcf_idr_check_alloc(tn, &index, a, bind); + if (err < 0) + return err; + + if (err && bind) + return 0; + + if (!err) { + ret = tcf_idr_create(tn, index, est, a, + &act_gate_ops, bind, false, 0); + if (ret) { + tcf_idr_cleanup(tn, index); + return ret; + } + + ret = ACT_P_CREATED; + } else if (!ovr) { + tcf_idr_release(*a, bind); + return -EEXIST; + } + + if (tb[TCA_GATE_PRIORITY]) + prio = nla_get_s32(tb[TCA_GATE_PRIORITY]); + + if (tb[TCA_GATE_BASE_TIME]) + basetime = nla_get_u64(tb[TCA_GATE_BASE_TIME]); + + if (tb[TCA_GATE_FLAGS]) + gflags = nla_get_u32(tb[TCA_GATE_FLAGS]); + + if (tb[TCA_GATE_CLOCKID]) { + clockid = nla_get_s32(tb[TCA_GATE_CLOCKID]); + switch (clockid) { + case CLOCK_REALTIME: + tk_offset = TK_OFFS_REAL; + break; + case CLOCK_MONOTONIC: + tk_offset = TK_OFFS_MAX; + break; + case 
CLOCK_BOOTTIME: + tk_offset = TK_OFFS_BOOT; + break; + case CLOCK_TAI: + tk_offset = TK_OFFS_TAI; + break; + default: + NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); + goto release_idr; + } + } + + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); + if (err < 0) + goto release_idr; + + gact = to_gate(*a); + + spin_lock_bh(&gact->tcf_lock); + p = &gact->param; + + if (tb[TCA_GATE_CYCLE_TIME]) { + p->tcfg_cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]); + if (!p->tcfg_cycletime_ext) + goto chain_put; + } + + INIT_LIST_HEAD(&p->entries); + if (tb[TCA_GATE_ENTRY_LIST]) { + err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack); + if (err < 0) + goto chain_put; + } + + if (!p->tcfg_cycletime) { + struct tcfg_gate_entry *entry; + ktime_t cycle = 0; + + list_for_each_entry(entry, &p->entries, list) + cycle = ktime_add_ns(cycle, entry->interval); + p->tcfg_cycletime = cycle; + } + + if (tb[TCA_GATE_CYCLE_TIME_EXT]) + p->tcfg_cycletime_ext = + nla_get_u64(tb[TCA_GATE_CYCLE_TIME_EXT]); + + p->tcfg_priority = prio; + p->tcfg_basetime = basetime; + p->tcfg_clockid = clockid; + p->tcfg_flags = gflags; + + gact->tk_offset = tk_offset; + hrtimer_init(&gact->hitimer, clockid, HRTIMER_MODE_ABS_SOFT); + gact->hitimer.function = gate_timer_func; + + err = gate_get_start_time(gact, &start); + if (err < 0) { + NL_SET_ERR_MSG(extack, + "Internal error: failed get start time"); + release_entry_list(&p->entries); + goto chain_put; + } + + gact->current_close_time = start; + gact->current_gate_status = GATE_ACT_GATE_OPEN | GATE_ACT_PENDING; + + gact->next_entry = list_first_entry(&p->entries, + struct tcfg_gate_entry, list); + + goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); + + gate_start_timer(gact, start); + + spin_unlock_bh(&gact->tcf_lock); + + if (goto_ch) + tcf_chain_put_by_act(goto_ch); + + if (ret == ACT_P_CREATED) + tcf_idr_insert(tn, *a); + + return ret; + +chain_put: + spin_unlock_bh(&gact->tcf_lock); + + if (goto_ch) + tcf_chain_put_by_act(goto_ch); +release_idr: + tcf_idr_release(*a, bind); + return err; +} + +static void tcf_gate_cleanup(struct tc_action *a) +{ + struct tcf_gate *gact = to_gate(a); + struct tcf_gate_params *p; + + hrtimer_cancel(&gact->hitimer); + + p = &gact->param; + + release_entry_list(&p->entries); +} + +static int dumping_entry(struct sk_buff *skb, + struct tcfg_gate_entry *entry) +{ + struct nlattr *item; + + item = nla_nest_start_noflag(skb, TCA_GATE_ONE_ENTRY); + if (!item) + return -ENOSPC; + + if (nla_put_u32(skb, TCA_GATE_ENTRY_INDEX, entry->index)) + goto nla_put_failure; + + if (entry->gate_state && nla_put_flag(skb, TCA_GATE_ENTRY_GATE)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_GATE_ENTRY_INTERVAL, entry->interval)) + goto nla_put_failure; + + if (nla_put_s32(skb, TCA_GATE_ENTRY_MAX_OCTETS, entry->maxoctets)) + goto nla_put_failure; + + if (nla_put_s32(skb, TCA_GATE_ENTRY_IPV, entry->ipv)) + goto nla_put_failure; + + return nla_nest_end(skb, item); + +nla_put_failure: + nla_nest_cancel(skb, item); + return -1; +} + +static int tcf_gate_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_gate *gact = to_gate(a); + struct tc_gate opt = { + .index = gact->tcf_index, + .refcnt = refcount_read(&gact->tcf_refcnt) - ref, + .bindcnt = atomic_read(&gact->tcf_bindcnt) - bind, + }; + struct tcfg_gate_entry *entry; + struct tcf_gate_params *p; + struct nlattr *entry_list; + struct tcf_t t; + + spin_lock_bh(&gact->tcf_lock); + opt.action = gact->tcf_action; + 
+ p = &gact->param; + + if (nla_put(skb, TCA_GATE_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + if (nla_put_u64_64bit(skb, TCA_GATE_BASE_TIME, + p->tcfg_basetime, TCA_GATE_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(skb, TCA_GATE_CYCLE_TIME, + p->tcfg_cycletime, TCA_GATE_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(skb, TCA_GATE_CYCLE_TIME_EXT, + p->tcfg_cycletime_ext, TCA_GATE_PAD)) + goto nla_put_failure; + + if (nla_put_s32(skb, TCA_GATE_CLOCKID, p->tcfg_clockid)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_GATE_FLAGS, p->tcfg_flags)) + goto nla_put_failure; + + if (nla_put_s32(skb, TCA_GATE_PRIORITY, p->tcfg_priority)) + goto nla_put_failure; + + entry_list = nla_nest_start_noflag(skb, TCA_GATE_ENTRY_LIST); + if (!entry_list) + goto nla_put_failure; + + list_for_each_entry(entry, &p->entries, list) { + if (dumping_entry(skb, entry) < 0) + goto nla_put_failure; + } + + nla_nest_end(skb, entry_list); + + tcf_tm_dump(&t, &gact->tcf_tm); + if (nla_put_64bit(skb, TCA_GATE_TM, sizeof(t), &t, TCA_GATE_PAD)) + goto nla_put_failure; + spin_unlock_bh(&gact->tcf_lock); + + return skb->len; + +nla_put_failure: + spin_unlock_bh(&gact->tcf_lock); + nlmsg_trim(skb, b); + return -1; +} + +static int tcf_gate_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) +{ + struct tc_action_net *tn = net_generic(net, gate_net_id); + + return tcf_generic_walker(tn, skb, cb, type, ops, extack); +} + +static void tcf_gate_stats_update(struct tc_action *a, u64 bytes, u32 packets, + u64 lastuse, bool hw) +{ + struct tcf_gate *gact = to_gate(a); + struct tcf_t *tm = &gact->tcf_tm; + + tcf_action_update_stats(a, bytes, packets, false, hw); + tm->lastuse = max_t(u64, tm->lastuse, lastuse); +} + +static int tcf_gate_search(struct net *net, struct tc_action **a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, gate_net_id); + + return tcf_idr_search(tn, a, index); +} + +static size_t tcf_gate_get_fill_size(const struct tc_action *act) +{ + return nla_total_size(sizeof(struct tc_gate)); +} + +static struct tc_action_ops act_gate_ops = { + .kind = "gate", + .id = TCA_ID_GATE, + .owner = THIS_MODULE, + .act = tcf_gate_act, + .dump = tcf_gate_dump, + .init = tcf_gate_init, + .cleanup = tcf_gate_cleanup, + .walk = tcf_gate_walker, + .stats_update = tcf_gate_stats_update, + .get_fill_size = tcf_gate_get_fill_size, + .lookup = tcf_gate_search, + .size = sizeof(struct tcf_gate), +}; + +static __net_init int gate_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, gate_net_id); + + return tc_action_net_init(net, tn, &act_gate_ops); +} + +static void __net_exit gate_exit_net(struct list_head *net_list) +{ + tc_action_net_exit(net_list, gate_net_id); +} + +static struct pernet_operations gate_net_ops = { + .init = gate_init_net, + .exit_batch = gate_exit_net, + .id = &gate_net_id, + .size = sizeof(struct tc_action_net), +}; + +static int __init gate_init_module(void) +{ + return tcf_register_action(&act_gate_ops, &gate_net_ops); +} + +static void __exit gate_cleanup_module(void) +{ + tcf_unregister_action(&act_gate_ops, &gate_net_ops); +} + +module_init(gate_init_module); +module_exit(gate_cleanup_module); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From d29bdd69ecdd70e8e3c2268fc8e188d6ab55e54a Mon Sep 17 00:00:00 2001 From: Po Liu Date: Fri, 1 May 2020 08:53:16 +0800 Subject: net: schedule: add action gate offloading Add the gate action to the flow action 
entry. Add the gate parameters to tc_setup_flow_action(), queueing them into the entries of the flow_action_entry array provided to the driver. Signed-off-by: Po Liu Signed-off-by: David S. Miller --- include/net/flow_offload.h | 10 +++++ include/net/tc_act/tc_gate.h | 99 ++++++++++++++++++++++++++++++++++++++++++++ net/sched/cls_api.c | 33 +++++++++++++++ 3 files changed, 142 insertions(+) (limited to 'net') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 3619c6acf60f..94a30fe02e6d 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -147,6 +147,7 @@ enum flow_action_id { FLOW_ACTION_MPLS_PUSH, FLOW_ACTION_MPLS_POP, FLOW_ACTION_MPLS_MANGLE, + FLOW_ACTION_GATE, NUM_FLOW_ACTIONS, }; @@ -255,6 +256,15 @@ struct flow_action_entry { u8 bos; u8 ttl; } mpls_mangle; + struct { + u32 index; + s32 prio; + u64 basetime; + u64 cycletime; + u64 cycletimeext; + u32 num_entries; + struct action_gate_entry *entries; + } gate; }; struct flow_action_cookie *cookie; /* user defined action cookie */ }; diff --git a/include/net/tc_act/tc_gate.h b/include/net/tc_act/tc_gate.h index 330ad8b02495..8bc6be81a7ad 100644 --- a/include/net/tc_act/tc_gate.h +++ b/include/net/tc_act/tc_gate.h @@ -7,6 +7,13 @@ #include #include +struct action_gate_entry { + u8 gate_state; + u32 interval; + s32 ipv; + s32 maxoctets; +}; + struct tcfg_gate_entry { int index; u8 gate_state; @@ -44,4 +51,96 @@ struct tcf_gate { #define to_gate(a) ((struct tcf_gate *)a) +static inline bool is_tcf_gate(const struct tc_action *a) +{ +#ifdef CONFIG_NET_CLS_ACT + if (a->ops && a->ops->id == TCA_ID_GATE) + return true; +#endif + return false; +} + +static inline u32 tcf_gate_index(const struct tc_action *a) +{ + return a->tcfa_index; +} + +static inline s32 tcf_gate_prio(const struct tc_action *a) +{ + s32 tcfg_prio; + + tcfg_prio = to_gate(a)->param.tcfg_priority; + + return tcfg_prio; +} + +static inline u64 tcf_gate_basetime(const struct tc_action *a) +{ + u64 tcfg_basetime; + + tcfg_basetime = to_gate(a)->param.tcfg_basetime; + + return tcfg_basetime; +} + +static inline u64 tcf_gate_cycletime(const struct tc_action *a) +{ + u64 tcfg_cycletime; + + tcfg_cycletime = to_gate(a)->param.tcfg_cycletime; + + return tcfg_cycletime; +} + +static inline u64 tcf_gate_cycletimeext(const struct tc_action *a) +{ + u64 tcfg_cycletimeext; + + tcfg_cycletimeext = to_gate(a)->param.tcfg_cycletime_ext; + + return tcfg_cycletimeext; +} + +static inline u32 tcf_gate_num_entries(const struct tc_action *a) +{ + u32 num_entries; + + num_entries = to_gate(a)->param.num_entries; + + return num_entries; +} + +static inline struct action_gate_entry + *tcf_gate_get_list(const struct tc_action *a) +{ + struct action_gate_entry *oe; + struct tcf_gate_params *p; + struct tcfg_gate_entry *entry; + u32 num_entries; + int i = 0; + + p = &to_gate(a)->param; + num_entries = p->num_entries; + + list_for_each_entry(entry, &p->entries, list) + i++; + + if (i != num_entries) + return NULL; + + oe = kcalloc(num_entries, sizeof(*oe), GFP_ATOMIC); + if (!oe) + return NULL; + + i = 0; + list_for_each_entry(entry, &p->entries, list) { + oe[i].gate_state = entry->gate_state; + oe[i].interval = entry->interval; + oe[i].ipv = entry->ipv; + oe[i].maxoctets = entry->maxoctets; + i++; + } + + return oe; +} #endif
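A hedged sketch of how a driver's flow-action parser might consume a FLOW_ACTION_GATE entry built by the code below (all foo_* names are hypothetical, not from any in-tree driver):

static int foo_setup_gate(struct foo_priv *priv,
			  const struct flow_action_entry *entry)
{
	u32 i;

	for (i = 0; i < entry->gate.num_entries; i++) {
		const struct action_gate_entry *e = &entry->gate.entries[i];

		/* e->gate_state: open (non-zero) or closed,
		 * e->interval: slot length in ns,
		 * e->ipv: internal priority, -1 is wildcard,
		 * e->maxoctets: octet budget per interval, -1 is unlimited
		 */
		foo_hw_program_slot(priv, i, e->gate_state, e->interval,
				    e->ipv, e->maxoctets);
	}

	/* entry->gate.index/prio/basetime/cycletime/cycletimeext carry
	 * the schedule-level parameters filled in by tc_setup_flow_action().
	 */
	return foo_hw_commit(priv, entry->gate.basetime,
			     entry->gate.cycletime);
}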
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 11b683c45c28..7e85c91d0752 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -39,6 +39,7 @@ #include #include #include +#include #include extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; @@ -3526,6 +3527,27 @@ static void tcf_sample_get_group(struct flow_action_entry *entry, #endif } +static void tcf_gate_entry_destructor(void *priv) +{ + struct action_gate_entry *oe = priv; + + kfree(oe); +} + +static int tcf_gate_get_entries(struct flow_action_entry *entry, + const struct tc_action *act) +{ + entry->gate.entries = tcf_gate_get_list(act); + + if (!entry->gate.entries) + return -EINVAL; + + entry->destructor = tcf_gate_entry_destructor; + entry->destructor_priv = entry->gate.entries; + + return 0; +} + int tc_setup_flow_action(struct flow_action *flow_action, const struct tcf_exts *exts) { @@ -3672,6 +3694,17 @@ int tc_setup_flow_action(struct flow_action *flow_action, } else if (is_tcf_skbedit_priority(act)) { entry->id = FLOW_ACTION_PRIORITY; entry->priority = tcf_skbedit_priority(act); + } else if (is_tcf_gate(act)) { + entry->id = FLOW_ACTION_GATE; + entry->gate.index = tcf_gate_index(act); + entry->gate.prio = tcf_gate_prio(act); + entry->gate.basetime = tcf_gate_basetime(act); + entry->gate.cycletime = tcf_gate_cycletime(act); + entry->gate.cycletimeext = tcf_gate_cycletimeext(act); + entry->gate.num_entries = tcf_gate_num_entries(act); + err = tcf_gate_get_entries(entry, act); + if (err) + goto err_out; } else { err = -EOPNOTSUPP; goto err_out_locked; -- cgit v1.2.3 From 7562a13d5a8ce9bc5020705da5f50221021f5a2c Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 1 May 2020 12:48:01 +0200 Subject: net/smc: multiple link support for rmb buffer registration The CONFIRM_RKEY LLC processing handles all links in one LLC message. Move the call to this processing out of smcr_link_reg_rmb(), which does per-link processing, into smcr_lgr_reg_rmbs(), which is responsible for link-group-level processing. Move smcr_link_reg_rmb() into module smc_core.c. From af_smc.c, now call smcr_lgr_reg_rmbs() to register new rmbs on all available links. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S.
Miller --- net/smc/af_smc.c | 54 +++++++++++++++++++----------------------------------- net/smc/smc_core.c | 16 ++++++++++++++++ net/smc/smc_core.h | 1 + 3 files changed, 36 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index bd9662d06896..20d6d3fbb86c 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -337,46 +337,30 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); } -/* register a new rmb, send confirm_rkey msg to register with peer */ -static int smcr_link_reg_rmb(struct smc_link *link, - struct smc_buf_desc *rmb_desc, bool conf_rkey) -{ - if (!rmb_desc->is_reg_mr[link->link_idx]) { - /* register memory region for new rmb */ - if (smc_wr_reg_send(link, rmb_desc->mr_rx[link->link_idx])) { - rmb_desc->is_reg_err = true; - return -EFAULT; - } - rmb_desc->is_reg_mr[link->link_idx] = true; - } - if (!conf_rkey) - return 0; - - /* exchange confirm_rkey msg with peer */ - if (!rmb_desc->is_conf_rkey) { - if (smc_llc_do_confirm_rkey(link, rmb_desc)) { - rmb_desc->is_reg_err = true; - return -EFAULT; - } - rmb_desc->is_conf_rkey = true; - } - return 0; -} - /* register the new rmb on all links */ -static int smcr_lgr_reg_rmbs(struct smc_link_group *lgr, +static int smcr_lgr_reg_rmbs(struct smc_link *link, struct smc_buf_desc *rmb_desc) { - int i, rc; + struct smc_link_group *lgr = link->lgr; + int i, rc = 0; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (lgr->lnk[i].state != SMC_LNK_ACTIVE) continue; - rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc, true); + rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc); if (rc) - return rc; + goto out; } - return 0; + + /* exchange confirm_rkey msg with peer */ + rc = smc_llc_do_confirm_rkey(link, rmb_desc); + if (rc) { + rc = -EFAULT; + goto out; + } + rmb_desc->is_conf_rkey = true; +out: + return rc; } static int smcr_clnt_conf_first_link(struct smc_sock *smc) @@ -408,7 +392,7 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) smc_wr_remember_qp_attr(link); - if (smcr_link_reg_rmb(link, smc->conn.rmb_desc, false)) + if (smcr_link_reg_rmb(link, smc->conn.rmb_desc)) return SMC_CLC_DECL_ERR_REGRMB; /* confirm_rkey is implicit on 1st contact */ @@ -670,7 +654,7 @@ static int smc_connect_rdma(struct smc_sock *smc, return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK, ini->cln_first_contact); } else { - if (smcr_lgr_reg_rmbs(smc->conn.lgr, smc->conn.rmb_desc)) + if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB, ini->cln_first_contact); } @@ -1045,7 +1029,7 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) link->lgr->type = SMC_LGR_SINGLE; - if (smcr_link_reg_rmb(link, smc->conn.rmb_desc, false)) + if (smcr_link_reg_rmb(link, smc->conn.rmb_desc)) return SMC_CLC_DECL_ERR_REGRMB; /* send CONFIRM LINK request to client over the RoCE fabric */ @@ -1220,7 +1204,7 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) struct smc_connection *conn = &new_smc->conn; if (local_contact != SMC_FIRST_CONTACT) { - if (smcr_lgr_reg_rmbs(conn->lgr, conn->rmb_desc)) + if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc)) return SMC_CLC_DECL_ERR_REGRMB; } smc_rmb_sync_sg_for_device(&new_smc->conn); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 3539ceef9a97..de6bc36fe9a7 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1127,6 +1127,22 @@ free_table: return rc; } +/* register a new rmb on IB device */ +int 
smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc) +{ + if (list_empty(&link->lgr->list)) + return -ENOLINK; + if (!rmb_desc->is_reg_mr[link->link_idx]) { + /* register memory region for new rmb */ + if (smc_wr_reg_send(link, rmb_desc->mr_rx[link->link_idx])) { + rmb_desc->is_reg_err = true; + return -EFAULT; + } + rmb_desc->is_reg_mr[link->link_idx] = true; + } + return 0; +} + static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, bool is_rmb, int bufsize) { diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index f12474cc666c..fd512188d2c6 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -367,6 +367,7 @@ void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr); int smc_core_init(void); void smc_core_exit(void); +int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc); static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) { return link->lgr; -- cgit v1.2.3 From 4a3641c160873fe6b6bcff00a6ea15e7430d8d42 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 1 May 2020 12:48:02 +0200 Subject: net/smc: unmapping of buffers to support multiple links With the support of multiple links that are created and cleared there is a need to unmap one link from all current buffers. Add unmapping by link and by rmb. And make smcr_link_clear() available to be called from the LLC layer. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 76 ++++++++++++++++++++++++++++++++++++++++++------------ net/smc/smc_core.h | 1 + 2 files changed, 60 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index de6bc36fe9a7..d5ecea490b4e 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -498,14 +498,69 @@ void smc_conn_free(struct smc_connection *conn) smc_lgr_schedule_free_work(lgr); } -static void smcr_link_clear(struct smc_link *lnk) +/* unregister a link from a buf_desc */ +static void smcr_buf_unmap_link(struct smc_buf_desc *buf_desc, bool is_rmb, + struct smc_link *lnk) +{ + if (is_rmb) + buf_desc->is_reg_mr[lnk->link_idx] = false; + if (!buf_desc->is_map_ib[lnk->link_idx]) + return; + if (is_rmb) { + if (buf_desc->mr_rx[lnk->link_idx]) { + smc_ib_put_memory_region( + buf_desc->mr_rx[lnk->link_idx]); + buf_desc->mr_rx[lnk->link_idx] = NULL; + } + smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE); + } else { + smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE); + } + sg_free_table(&buf_desc->sgt[lnk->link_idx]); + buf_desc->is_map_ib[lnk->link_idx] = false; +} + +/* unmap all buffers of lgr for a deleted link */ +static void smcr_buf_unmap_lgr(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + struct smc_buf_desc *buf_desc, *bf; + int i; + + for (i = 0; i < SMC_RMBE_SIZES; i++) { + mutex_lock(&lgr->rmbs_lock); + list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) + smcr_buf_unmap_link(buf_desc, true, lnk); + mutex_unlock(&lgr->rmbs_lock); + mutex_lock(&lgr->sndbufs_lock); + list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], + list) + smcr_buf_unmap_link(buf_desc, false, lnk); + mutex_unlock(&lgr->sndbufs_lock); + } +} + +static void smcr_rtoken_clear_link(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + int i; + + for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { + lgr->rtokens[i][lnk->link_idx].rkey = 0; + lgr->rtokens[i][lnk->link_idx].dma_addr = 0; + } +} + +void smcr_link_clear(struct smc_link *lnk) { struct smc_ib_device *smcibdev; - if 
(lnk->peer_qpn == 0) + if (!lnk->lgr || lnk->state == SMC_LNK_UNUSED) return; lnk->peer_qpn = 0; smc_llc_link_clear(lnk); + smcr_buf_unmap_lgr(lnk); + smcr_rtoken_clear_link(lnk); smc_ib_modify_qp_reset(lnk); smc_wr_free_link(lnk); smc_ib_destroy_queue_pair(lnk); @@ -522,23 +577,10 @@ static void smcr_link_clear(struct smc_link *lnk) static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc) { - struct smc_link *lnk; int i; - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - lnk = &lgr->lnk[i]; - if (!buf_desc->is_map_ib[lnk->link_idx]) - continue; - if (is_rmb) { - if (buf_desc->mr_rx[lnk->link_idx]) - smc_ib_put_memory_region( - buf_desc->mr_rx[lnk->link_idx]); - smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE); - } else { - smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE); - } - sg_free_table(&buf_desc->sgt[lnk->link_idx]); - } + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + smcr_buf_unmap_link(buf_desc, is_rmb, &lgr->lnk[i]); if (buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index fd512188d2c6..fa532a423fd7 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -367,6 +367,7 @@ void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr); int smc_core_init(void); void smc_core_exit(void); +void smcr_link_clear(struct smc_link *lnk); int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc); static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) { -- cgit v1.2.3 From fb33d27727254618aaf6bc2fedcb0fda1d5c0239 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 1 May 2020 12:48:03 +0200 Subject: net/smc: map and register buffers for a new link Introduce support to map and register all current buffers for a new link. smcr_buf_map_lgr() will map used buffers for a new link and smcr_buf_reg_lgr() can be called to register used buffers on the IB device of the new link. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_core.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ net/smc/smc_core.h | 2 ++ 2 files changed, 62 insertions(+) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index d5ecea490b4e..0e87f652caea 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1185,6 +1185,66 @@ int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc) return 0; } +static int _smcr_buf_map_lgr(struct smc_link *lnk, struct mutex *lock, + struct list_head *lst, bool is_rmb) +{ + struct smc_buf_desc *buf_desc, *bf; + int rc = 0; + + mutex_lock(lock); + list_for_each_entry_safe(buf_desc, bf, lst, list) { + if (!buf_desc->used) + continue; + rc = smcr_buf_map_link(buf_desc, is_rmb, lnk); + if (rc) + goto out; + } +out: + mutex_unlock(lock); + return rc; +} + +/* map all used buffers of lgr for a new link */ +int smcr_buf_map_lgr(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + int i, rc = 0; + + for (i = 0; i < SMC_RMBE_SIZES; i++) { + rc = _smcr_buf_map_lgr(lnk, &lgr->rmbs_lock, + &lgr->rmbs[i], true); + if (rc) + return rc; + rc = _smcr_buf_map_lgr(lnk, &lgr->sndbufs_lock, + &lgr->sndbufs[i], false); + if (rc) + return rc; + } + return 0; +} + +/* register all used buffers of lgr for a new link */ +int smcr_buf_reg_lgr(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + struct smc_buf_desc *buf_desc, *bf; + int i, rc = 0; + + mutex_lock(&lgr->rmbs_lock); + for (i = 0; i < SMC_RMBE_SIZES; i++) { + list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) { + if (!buf_desc->used) + continue; + rc = smcr_link_reg_rmb(lnk, buf_desc); + if (rc) + goto out; + } + } +out: + mutex_unlock(&lgr->rmbs_lock); + return rc; +} + static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, bool is_rmb, int bufsize) { diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index fa532a423fd7..61ddb5264936 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -368,6 +368,8 @@ int smc_core_init(void); void smc_core_exit(void); void smcr_link_clear(struct smc_link *lnk); +int smcr_buf_map_lgr(struct smc_link *lnk); +int smcr_buf_reg_lgr(struct smc_link *lnk); int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc); static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) { -- cgit v1.2.3 From fbed3b37c89633eb602f4ec8e30186e601b793e5 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 1 May 2020 12:48:04 +0200 Subject: net/smc: extend smc_llc_send_add_link() and smc_llc_send_delete_link() All LLC sends are done from worker context only, so remove the prep functions which were used to build the message before it was sent, and add the function content into the respective send function smc_llc_send_add_link() and smc_llc_send_delete_link(). Extend smc_llc_send_add_link() to include the qp_mtu value in the LLC message, which is needed to establish a link after the initial link was created. Extend smc_llc_send_delete_link() to contain a link_id and a reason code for the link deletion in the LLC message, which is needed when a specific link should be deleted. And add the list of existing DELETE_LINK reason codes. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_core.c | 3 +- net/smc/smc_llc.c | 89 ++++++++++++++++++++++++++++-------------------------- net/smc/smc_llc.h | 16 ++++++++-- 3 files changed, 62 insertions(+), 46 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 0e87f652caea..c905675017c7 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -199,7 +199,8 @@ void smc_lgr_cleanup_early(struct smc_connection *conn) static int smcr_link_send_delete(struct smc_link *lnk, bool orderly) { if (lnk->state == SMC_LNK_ACTIVE && - !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, orderly)) { + !smc_llc_send_delete_link(lnk, 0, SMC_LLC_REQ, orderly, + SMC_LLC_DEL_PROG_INIT_TERM)) { return 0; } return -ENOTCONN; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 327cf30b98cc..171835926db6 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -58,7 +58,13 @@ struct smc_llc_msg_add_link { /* type 0x02 */ u8 sender_gid[SMC_GID_SIZE]; u8 sender_qp_num[3]; u8 link_num; - u8 flags2; /* QP mtu */ +#if defined(__BIG_ENDIAN_BITFIELD) + u8 reserved3 : 4, + qp_mtu : 4; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 qp_mtu : 4, + reserved3 : 4; +#endif u8 initial_psn[3]; u8 reserved[8]; }; @@ -427,26 +433,9 @@ static int smc_llc_send_delete_rkey(struct smc_link *link, return rc; } -/* prepare an add link message */ -static void smc_llc_prep_add_link(struct smc_llc_msg_add_link *addllc, - struct smc_link *link, u8 mac[], u8 gid[], - enum smc_llc_reqresp reqresp) -{ - memset(addllc, 0, sizeof(*addllc)); - addllc->hd.common.type = SMC_LLC_ADD_LINK; - addllc->hd.length = sizeof(struct smc_llc_msg_add_link); - if (reqresp == SMC_LLC_RESP) { - addllc->hd.flags |= SMC_LLC_FLAG_RESP; - /* always reject more links for now */ - addllc->hd.flags |= SMC_LLC_FLAG_ADD_LNK_REJ; - addllc->hd.add_link_rej_rsn = SMC_LLC_REJ_RSN_NO_ALT_PATH; - } - memcpy(addllc->sender_mac, mac, ETH_ALEN); - memcpy(addllc->sender_gid, gid, SMC_GID_SIZE); -} - /* send ADD LINK request or response */ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], + struct smc_link *link_new, enum smc_llc_reqresp reqresp) { struct smc_llc_msg_add_link *addllc; @@ -458,32 +447,33 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], if (rc) return rc; addllc = (struct smc_llc_msg_add_link *)wr_buf; - smc_llc_prep_add_link(addllc, link, mac, gid, reqresp); + + memset(addllc, 0, sizeof(*addllc)); + addllc->hd.common.type = SMC_LLC_ADD_LINK; + addllc->hd.length = sizeof(struct smc_llc_msg_add_link); + if (reqresp == SMC_LLC_RESP) + addllc->hd.flags |= SMC_LLC_FLAG_RESP; + memcpy(addllc->sender_mac, mac, ETH_ALEN); + memcpy(addllc->sender_gid, gid, SMC_GID_SIZE); + if (link_new) { + addllc->link_num = link_new->link_id; + hton24(addllc->sender_qp_num, link_new->roce_qp->qp_num); + hton24(addllc->initial_psn, link_new->psn_initial); + if (reqresp == SMC_LLC_REQ) + addllc->qp_mtu = link_new->path_mtu; + else + addllc->qp_mtu = min(link_new->path_mtu, + link_new->peer_mtu); + } /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; } -/* prepare a delete link message */ -static void smc_llc_prep_delete_link(struct smc_llc_msg_del_link *delllc, - struct smc_link *link, - enum smc_llc_reqresp reqresp, bool orderly) -{ - memset(delllc, 0, sizeof(*delllc)); - delllc->hd.common.type = SMC_LLC_DELETE_LINK; - delllc->hd.length = sizeof(struct smc_llc_msg_add_link); - if (reqresp == SMC_LLC_RESP) - delllc->hd.flags |= SMC_LLC_FLAG_RESP; - /* DEL_LINK_ALL because only 1 link supported */ - delllc->hd.flags |= 
SMC_LLC_FLAG_DEL_LINK_ALL; - if (orderly) - delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; - delllc->link_num = link->link_id; -} - /* send DELETE LINK request or response */ -int smc_llc_send_delete_link(struct smc_link *link, - enum smc_llc_reqresp reqresp, bool orderly) +int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, + enum smc_llc_reqresp reqresp, bool orderly, + u32 reason) { struct smc_llc_msg_del_link *delllc; struct smc_wr_tx_pend_priv *pend; @@ -494,7 +484,19 @@ int smc_llc_send_delete_link(struct smc_link *link, if (rc) return rc; delllc = (struct smc_llc_msg_del_link *)wr_buf; - smc_llc_prep_delete_link(delllc, link, reqresp, orderly); + + memset(delllc, 0, sizeof(*delllc)); + delllc->hd.common.type = SMC_LLC_DELETE_LINK; + delllc->hd.length = sizeof(struct smc_llc_msg_del_link); + if (reqresp == SMC_LLC_RESP) + delllc->hd.flags |= SMC_LLC_FLAG_RESP; + if (orderly) + delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; + if (link_del_id) + delllc->link_num = link_del_id; + else + delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL; + delllc->reason = htonl(reason); /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; @@ -547,12 +549,13 @@ static void smc_llc_rx_delete_link(struct smc_link *link, smc_lgr_forget(lgr); if (lgr->role == SMC_SERV) { /* client asks to delete this link, send request */ - smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ, true); + smc_llc_send_delete_link(link, 0, SMC_LLC_REQ, true, + SMC_LLC_DEL_PROG_INIT_TERM); } else { /* server requests to delete this link, send response */ - smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true); + smc_llc_send_delete_link(link, 0, SMC_LLC_RESP, true, + SMC_LLC_DEL_PROG_INIT_TERM); } - smc_llc_send_message(link, llc); smc_lgr_terminate_sched(lgr); } diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 48029a5e14c3..d2c50d3e43a6 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -35,6 +35,16 @@ enum smc_llc_msg_type { SMC_LLC_DELETE_RKEY = 0x09, }; +/* LLC DELETE LINK Request Reason Codes */ +#define SMC_LLC_DEL_LOST_PATH 0x00010000 +#define SMC_LLC_DEL_OP_INIT_TERM 0x00020000 +#define SMC_LLC_DEL_PROG_INIT_TERM 0x00030000 +#define SMC_LLC_DEL_PROT_VIOL 0x00040000 +#define SMC_LLC_DEL_NO_ASYM_NEEDED 0x00050000 +/* LLC DELETE LINK Response Reason Codes */ +#define SMC_LLC_DEL_NOLNK 0x00100000 /* Unknown Link ID (no link) */ +#define SMC_LLC_DEL_NOLGR 0x00200000 /* Unknown Link Group */ + /* returns a usable link of the link group, or NULL */ static inline struct smc_link *smc_llc_usable_link(struct smc_link_group *lgr) { @@ -50,9 +60,11 @@ static inline struct smc_link *smc_llc_usable_link(struct smc_link_group *lgr) int smc_llc_send_confirm_link(struct smc_link *lnk, enum smc_llc_reqresp reqresp); int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], + struct smc_link *link_new, enum smc_llc_reqresp reqresp); -int smc_llc_send_delete_link(struct smc_link *link, - enum smc_llc_reqresp reqresp, bool orderly); +int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, + enum smc_llc_reqresp reqresp, bool orderly, + u32 reason); void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc); void smc_llc_lgr_clear(struct smc_link_group *lgr); int smc_llc_link_init(struct smc_link *link); -- cgit v1.2.3 From d550066776aae3bb31e0240cab24f62e33c47fd3 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 1 May 2020 12:48:05 +0200 Subject: net/smc: mutex to protect the lgr against parallel reconfigurations Introduce llc_conf_mutex in the link group 
which is used to protect the buffers and lgr states against parallel link reconfiguration. This ensures that new connections do not start to register buffers with the links of a link group when link creation or termination is running. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 9 +++++++++ net/smc/smc_core.c | 26 ++++++++++++++++++++++---- net/smc/smc_core.h | 2 ++ net/smc/smc_llc.c | 9 +-------- 4 files changed, 34 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 20d6d3fbb86c..6663a63be9e4 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -344,6 +344,13 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link, struct smc_link_group *lgr = link->lgr; int i, rc = 0; + rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); + if (rc) + return rc; + /* protect against parallel smc_llc_cli_rkey_exchange() and + * parallel smcr_link_reg_rmb() + */ + mutex_lock(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (lgr->lnk[i].state != SMC_LNK_ACTIVE) continue; @@ -360,6 +367,8 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link, } rmb_desc->is_conf_rkey = true; out: + mutex_unlock(&lgr->llc_conf_mutex); + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); return rc; } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index c905675017c7..4c3af05d76a5 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -448,11 +448,21 @@ out: static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc, struct smc_link_group *lgr) { + int rc; + if (rmb_desc->is_conf_rkey && !list_empty(&lgr->list)) { /* unregister rmb with peer */ - smc_llc_do_delete_rkey(lgr, rmb_desc); - rmb_desc->is_conf_rkey = false; + rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); + if (!rc) { + /* protect against smc_llc_cli_rkey_exchange() */ + mutex_lock(&lgr->llc_conf_mutex); + smc_llc_do_delete_rkey(lgr, rmb_desc); + rmb_desc->is_conf_rkey = false; + mutex_unlock(&lgr->llc_conf_mutex); + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); + } } + if (rmb_desc->is_reg_err) { /* buf registration failed, reuse not possible */ mutex_lock(&lgr->rmbs_lock); @@ -552,6 +562,7 @@ static void smcr_rtoken_clear_link(struct smc_link *lnk) } } +/* must be called under lgr->llc_conf_mutex lock */ void smcr_link_clear(struct smc_link *lnk) { struct smc_ib_device *smcibdev; @@ -1170,7 +1181,9 @@ free_table: return rc; } -/* register a new rmb on IB device */ +/* register a new rmb on IB device, + * must be called under lgr->llc_conf_mutex lock + */ int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc) { if (list_empty(&link->lgr->list)) @@ -1224,7 +1237,9 @@ int smcr_buf_map_lgr(struct smc_link *lnk) return 0; } -/* register all used buffers of lgr for a new link */ +/* register all used buffers of lgr for a new link, + * must be called under lgr->llc_conf_mutex lock + */ int smcr_buf_reg_lgr(struct smc_link *lnk) { struct smc_link_group *lgr = lnk->lgr; @@ -1278,6 +1293,8 @@ static int smcr_buf_map_usable_links(struct smc_link_group *lgr, { int i, rc = 0; + /* protect against parallel link reconfiguration */ + mutex_lock(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; @@ -1289,6 +1306,7 @@ static int smcr_buf_map_usable_links(struct smc_link_group *lgr, } } out: + mutex_unlock(&lgr->llc_conf_mutex); return rc; } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 61ddb5264936..aa198dd0f0e4 100644 --- a/net/smc/smc_core.h +++ 
b/net/smc/smc_core.h @@ -248,6 +248,8 @@ struct smc_link_group { /* queue for llc events */ spinlock_t llc_event_q_lock; /* protects llc_event_q */ + struct mutex llc_conf_mutex; + /* protects lgr reconfig. */ struct work_struct llc_event_work; /* llc event worker */ wait_queue_head_t llc_waiter; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 171835926db6..ceed3c89926f 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -848,6 +848,7 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) spin_lock_init(&lgr->llc_event_q_lock); spin_lock_init(&lgr->llc_flow_lock); init_waitqueue_head(&lgr->llc_waiter); + mutex_init(&lgr->llc_conf_mutex); lgr->llc_testlink_time = net->ipv4.sysctl_tcp_keepalive_time; } @@ -897,9 +898,6 @@ int smc_llc_do_confirm_rkey(struct smc_link *send_link, struct smc_llc_qentry *qentry = NULL; int rc = 0; - rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); - if (rc) - return rc; rc = smc_llc_send_confirm_rkey(send_link, rmb_desc); if (rc) goto out; @@ -911,7 +909,6 @@ int smc_llc_do_confirm_rkey(struct smc_link *send_link, out: if (qentry) smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); - smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); return rc; } @@ -927,9 +924,6 @@ int smc_llc_do_delete_rkey(struct smc_link_group *lgr, if (!send_link) return -ENOLINK; - rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); - if (rc) - return rc; /* protected by llc_flow control */ rc = smc_llc_send_delete_rkey(send_link, rmb_desc); if (rc) @@ -942,7 +936,6 @@ int smc_llc_do_delete_rkey(struct smc_link_group *lgr, out: if (qentry) smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); - smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); return rc; } -- cgit v1.2.3 From 35dcf7ec02dcff16504bc52a368822254f889f00 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 1 May 2020 12:48:06 +0200 Subject: net/smc: remember PNETID of IB device for later device matching The PNETID is needed to find an alternate link for a link group. Save the PNETID of the link that is used to create the link group for later device matching. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 2 ++ net/smc/smc_core.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 4c3af05d76a5..d7ab92fc5b15 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -413,6 +413,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, SMC_SYSTEMID_LEN); + memcpy(lgr->pnet_id, ini->ib_dev->pnetid[ini->ib_port - 1], + SMC_MAX_PNETID_LEN); smc_llc_lgr_init(lgr, smc); link_idx = SMC_SINGLE_LINK; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index aa198dd0f0e4..413eaad50c7f 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -244,6 +244,8 @@ struct smc_link_group { u8 next_link_id; enum smc_lgr_type type; /* redundancy state */ + u8 pnet_id[SMC_MAX_PNETID_LEN + 1]; + /* pnet id of this lgr */ struct list_head llc_event_q; /* queue for llc events */ spinlock_t llc_event_q_lock; -- cgit v1.2.3 From 1f90a05d9ff907c70456e7c9d7058372679a88c6 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 1 May 2020 12:48:07 +0200 Subject: net/smc: add smcr_port_add() and smcr_link_up() processing Call smcr_port_add() when an IB event reports a new active IB device. 
smcr_port_add() will start a work which either triggers the local ADD_LINK processing, or send an ADD_LINK LLC message to the SMC server to initiate the processing. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ net/smc/smc_core.h | 1 + net/smc/smc_ib.c | 1 + 3 files changed, 88 insertions(+) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index d7ab92fc5b15..20bc9e46bf52 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -44,10 +44,19 @@ static struct smc_lgr_list smc_lgr_list = { /* established link groups */ static atomic_t lgr_cnt = ATOMIC_INIT(0); /* number of existing link groups */ static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted); +struct smc_ib_up_work { + struct work_struct work; + struct smc_link_group *lgr; + struct smc_ib_device *smcibdev; + u8 ibport; +}; + static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc); static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft); +static void smc_link_up_work(struct work_struct *work); + /* return head of link group list and its lock for a given link group */ static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr, spinlock_t **lgr_lock) @@ -928,6 +937,83 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) } } +/* link is up - establish alternate link if applicable */ +static void smcr_link_up(struct smc_link_group *lgr, + struct smc_ib_device *smcibdev, u8 ibport) +{ + struct smc_link *link = NULL; + + if (list_empty(&lgr->list) || + lgr->type == SMC_LGR_SYMMETRIC || + lgr->type == SMC_LGR_ASYMMETRIC_PEER) + return; + + if (lgr->role == SMC_SERV) { + /* trigger local add link processing */ + link = smc_llc_usable_link(lgr); + if (!link) + return; + /* tbd: call smc_llc_srv_add_link_local(link); */ + } else { + /* invite server to start add link processing */ + u8 gid[SMC_GID_SIZE]; + + if (smc_ib_determine_gid(smcibdev, ibport, lgr->vlan_id, gid, + NULL)) + return; + if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { + /* some other llc task is ongoing */ + wait_event_interruptible_timeout(lgr->llc_waiter, + (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE), + SMC_LLC_WAIT_TIME); + } + if (list_empty(&lgr->list) || + !smc_ib_port_active(smcibdev, ibport)) + return; /* lgr or device no longer active */ + link = smc_llc_usable_link(lgr); + if (!link) + return; + smc_llc_send_add_link(link, smcibdev->mac[ibport - 1], gid, + NULL, SMC_LLC_REQ); + } +} + +void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct smc_ib_up_work *ib_work; + struct smc_link_group *lgr, *n; + + list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) { + if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id, + SMC_MAX_PNETID_LEN) || + lgr->type == SMC_LGR_SYMMETRIC || + lgr->type == SMC_LGR_ASYMMETRIC_PEER) + continue; + ib_work = kmalloc(sizeof(*ib_work), GFP_KERNEL); + if (!ib_work) + continue; + INIT_WORK(&ib_work->work, smc_link_up_work); + ib_work->lgr = lgr; + ib_work->smcibdev = smcibdev; + ib_work->ibport = ibport; + schedule_work(&ib_work->work); + } +} + +static void smc_link_up_work(struct work_struct *work) +{ + struct smc_ib_up_work *ib_work = container_of(work, + struct smc_ib_up_work, + work); + struct smc_link_group *lgr = ib_work->lgr; + + if (list_empty(&lgr->list)) + goto out; + smcr_link_up(lgr, ib_work->smcibdev, ib_work->ibport); +out: + kfree(ib_work); +} + /* Determine vlan of internal TCP 
socket. * @vlan_id: address to store the determined vlan id into */ diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 413eaad50c7f..86453ad83491 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -345,6 +345,7 @@ void smc_lgr_forget(struct smc_link_group *lgr); void smc_lgr_cleanup_early(struct smc_connection *conn); void smc_lgr_terminate_sched(struct smc_link_group *lgr); void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport); +void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport); void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan); void smc_smcd_terminate_all(struct smcd_dev *dev); diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index c090678a3e5a..545fb0bc3714 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -252,6 +252,7 @@ static void smc_ib_port_event_work(struct work_struct *work) smc_port_terminate(smcibdev, port_idx + 1); } else { clear_bit(port_idx, smcibdev->ports_going_away); + smcr_port_add(smcibdev, port_idx + 1); } } } -- cgit v1.2.3 From 541afa10c126b6c22c2a805a559c70cc41fd156e Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 1 May 2020 12:48:08 +0200 Subject: net/smc: add smcr_port_err() and smcr_link_down() processing Call smcr_port_err() when an IB event reports an inactive IB device. smcr_port_err() calls smcr_link_down() for all affected links. smcr_link_down() either triggers the local DELETE_LINK processing, or sends a DELETE_LINK LLC message to the SMC server to initiate the processing. The old handler function smc_port_terminate() is removed. Add helper smcr_link_down_cond() to take a link down conditionally, and smcr_link_down_cond_sched() to schedule the link_down processing via a work. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S.
Miller --- net/smc/smc_core.c | 119 +++++++++++++++++++++++++++++++++++++++-------------- net/smc/smc_core.h | 6 ++- net/smc/smc_ib.c | 2 +- net/smc/smc_llc.h | 3 ++ 4 files changed, 98 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 20bc9e46bf52..62108e0cd529 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -56,6 +56,7 @@ static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft); static void smc_link_up_work(struct work_struct *work); +static void smc_link_down_work(struct work_struct *work); /* return head of link group list and its lock for a given link group */ static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr, @@ -320,6 +321,7 @@ static int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->smcibdev = ini->ib_dev; lnk->ibport = ini->ib_port; lnk->path_mtu = ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; + INIT_WORK(&lnk->link_down_wrk, smc_link_down_work); if (!ini->ib_dev->initialized) { rc = (int)smc_ib_setup_per_ibdev(ini->ib_dev); if (rc) @@ -818,36 +820,6 @@ void smc_lgr_terminate_sched(struct smc_link_group *lgr) schedule_work(&lgr->terminate_work); } -/* Called when IB port is terminated */ -void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) -{ - struct smc_link_group *lgr, *l; - LIST_HEAD(lgr_free_list); - int i; - - spin_lock_bh(&smc_lgr_list.lock); - list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { - if (lgr->is_smcd) - continue; - /* tbd - terminate only when no more links are active */ - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (!smc_link_usable(&lgr->lnk[i])) - continue; - if (lgr->lnk[i].smcibdev == smcibdev && - lgr->lnk[i].ibport == ibport) { - list_move(&lgr->list, &lgr_free_list); - lgr->freeing = 1; - } - } - } - spin_unlock_bh(&smc_lgr_list.lock); - - list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { - list_del_init(&lgr->list); - __smc_lgr_terminate(lgr, false); - } -} - /* Called when peer lgr shutdown (regularly or abnormally) is received */ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan) { @@ -1000,6 +972,79 @@ void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport) } } +/* link is down - switch connections to alternate link, + * must be called under lgr->llc_conf_mutex lock + */ +static void smcr_link_down(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + struct smc_link *to_lnk; + int del_link_id; + + if (!lgr || lnk->state == SMC_LNK_UNUSED || list_empty(&lgr->list)) + return; + + smc_ib_modify_qp_reset(lnk); + to_lnk = NULL; + /* tbd: call to_lnk = smc_switch_conns(lgr, lnk, true); */ + if (!to_lnk) { /* no backup link available */ + smcr_link_clear(lnk); + return; + } + lgr->type = SMC_LGR_SINGLE; + del_link_id = lnk->link_id; + + if (lgr->role == SMC_SERV) { + /* trigger local delete link processing */ + } else { + if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { + /* another llc task is ongoing */ + mutex_unlock(&lgr->llc_conf_mutex); + wait_event_interruptible_timeout(lgr->llc_waiter, + (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE), + SMC_LLC_WAIT_TIME); + mutex_lock(&lgr->llc_conf_mutex); + } + smc_llc_send_delete_link(to_lnk, del_link_id, SMC_LLC_REQ, true, + SMC_LLC_DEL_LOST_PATH); + } +} + +/* must be called under lgr->llc_conf_mutex lock */ +void smcr_link_down_cond(struct smc_link *lnk) +{ + if (smc_link_downing(&lnk->state)) + smcr_link_down(lnk); 
+} + +/* will get the lgr->llc_conf_mutex lock */ +void smcr_link_down_cond_sched(struct smc_link *lnk) +{ + if (smc_link_downing(&lnk->state)) + schedule_work(&lnk->link_down_wrk); +} + +void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct smc_link_group *lgr, *n; + int i; + + list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) { + if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id, + SMC_MAX_PNETID_LEN)) + continue; /* lgr is not affected */ + if (list_empty(&lgr->list)) + continue; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; + + if (smc_link_usable(lnk) && + lnk->smcibdev == smcibdev && lnk->ibport == ibport) + smcr_link_down_cond_sched(lnk); + } + } +} + static void smc_link_up_work(struct work_struct *work) { struct smc_ib_up_work *ib_work = container_of(work, @@ -1014,6 +1059,20 @@ out: kfree(ib_work); } +static void smc_link_down_work(struct work_struct *work) +{ + struct smc_link *link = container_of(work, struct smc_link, + link_down_wrk); + struct smc_link_group *lgr = link->lgr; + + if (list_empty(&lgr->list)) + return; + wake_up_interruptible_all(&lgr->llc_waiter); + mutex_lock(&lgr->llc_conf_mutex); + smcr_link_down(link); + mutex_unlock(&lgr->llc_conf_mutex); +} + /* Determine vlan of internal TCP socket. * @vlan_id: address to store the determined vlan id into */ diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 86453ad83491..da3cddbd1651 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -117,6 +117,7 @@ struct smc_link { u8 link_id; /* unique # within link group */ u8 link_idx; /* index in lgr link array */ struct smc_link_group *lgr; /* parent link group */ + struct work_struct link_down_wrk; /* wrk to bring link down */ enum smc_link_state state; /* state of link */ struct delayed_work llc_testlink_wrk; /* testlink worker */ @@ -344,8 +345,8 @@ struct smc_clc_msg_local; void smc_lgr_forget(struct smc_link_group *lgr); void smc_lgr_cleanup_early(struct smc_connection *conn); void smc_lgr_terminate_sched(struct smc_link_group *lgr); -void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport); void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport); +void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport); void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan); void smc_smcd_terminate_all(struct smcd_dev *dev); @@ -376,6 +377,9 @@ void smcr_link_clear(struct smc_link *lnk); int smcr_buf_map_lgr(struct smc_link *lnk); int smcr_buf_reg_lgr(struct smc_link *lnk); int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc); +void smcr_link_down_cond(struct smc_link *lnk); +void smcr_link_down_cond_sched(struct smc_link *lnk); + static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) { return link->lgr; diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 545fb0bc3714..2c743caad69a 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -249,7 +249,7 @@ static void smc_ib_port_event_work(struct work_struct *work) clear_bit(port_idx, &smcibdev->port_event_mask); if (!smc_ib_port_active(smcibdev, port_idx + 1)) { set_bit(port_idx, smcibdev->ports_going_away); - smc_port_terminate(smcibdev, port_idx + 1); + smcr_port_err(smcibdev, port_idx + 1); } else { clear_bit(port_idx, smcibdev->ports_going_away); smcr_port_add(smcibdev, port_idx + 1); diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index d2c50d3e43a6..4ed4486e5082 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -35,6 +35,9 @@ enum 
smc_llc_msg_type { SMC_LLC_DELETE_RKEY = 0x09, }; +#define smc_link_downing(state) \ + (cmpxchg(state, SMC_LNK_ACTIVE, SMC_LNK_INACTIVE) == SMC_LNK_ACTIVE) + /* LLC DELETE LINK Request Reason Codes */ #define SMC_LLC_DEL_LOST_PATH 0x00010000 #define SMC_LLC_DEL_OP_INIT_TERM 0x00020000 -- cgit v1.2.3
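The smc_link_downing() macro added above is the serialization point for all link-down paths: cmpxchg() atomically moves the link state from SMC_LNK_ACTIVE to SMC_LNK_INACTIVE, so when several CPUs report the same failure, exactly one caller wins the transition and runs the teardown. Below is a minimal standalone sketch of this claim-the-transition pattern, using C11 atomics in place of the kernel's cmpxchg(); all names are invented for the illustration.

#include <stdatomic.h>
#include <stdio.h>

enum lnk_state { LNK_UNUSED, LNK_ACTIVE, LNK_INACTIVE };

/* returns 1 only for the caller that actually moved ACTIVE -> INACTIVE */
static int link_downing(_Atomic int *state)
{
	int expected = LNK_ACTIVE;

	return atomic_compare_exchange_strong(state, &expected, LNK_INACTIVE);
}

int main(void)
{
	_Atomic int state = LNK_ACTIVE;

	printf("first caller: %d\n", link_downing(&state));  /* 1: runs teardown */
	printf("second caller: %d\n", link_downing(&state)); /* 0: does nothing */
	return 0;
}

This is why smcr_link_down_cond() and smcr_link_down_cond_sched() can be called from many places without further locking: the losers of the race simply return.

From 87523930a16eb57ebb20318e92b5df4b64fe8b20 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 1 May 2020 12:48:09 +0200 Subject: net/smc: take link down instead of terminating the link group Use the introduced link down processing in all places where the link group is terminated and take down the affected link only. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 7 ++----- net/smc/smc_llc.c | 4 ++-- net/smc/smc_tx.c | 2 +- net/smc/smc_wr.c | 19 ++++++++----------- 4 files changed, 13 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 62108e0cd529..849ae3f9b796 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -884,11 +884,8 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) } else { list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (lgr->lnk[i].smcibdev == smcibdev) { - list_move(&lgr->list, &lgr_free_list); - lgr->freeing = 1; - break; - } + if (lgr->lnk[i].smcibdev == smcibdev) + smcr_link_down_cond_sched(&lgr->lnk[i]); } } } diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index ceed3c89926f..e478a4c11877 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -556,7 +556,7 @@ static void smc_llc_rx_delete_link(struct smc_link *link, smc_llc_send_delete_link(link, 0, SMC_LLC_RESP, true, SMC_LLC_DEL_PROG_INIT_TERM); } - smc_lgr_terminate_sched(lgr); + smcr_link_down_cond(link); } /* process a confirm_rkey request from peer, remote flow */ @@ -831,7 +831,7 @@ static void smc_llc_testlink_work(struct work_struct *work) if (link->state != SMC_LNK_ACTIVE) return; /* link state changed */ if (rc <= 0) { - smc_lgr_terminate_sched(smc_get_lgr(link)); + smcr_link_down_cond_sched(link); return; } next_interval = link->llc_testlink_time; diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index d74bfe6a90f1..417204572a69 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -283,7 +283,7 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey; rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); if (rc) - smc_lgr_terminate_sched(lgr); + smcr_link_down_cond_sched(link); return rc; } diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 93223628c002..031e6c9561b1 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -120,8 +120,8 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) sizeof(link->wr_tx_bufs[i])); clear_bit(i, link->wr_tx_mask); } - /* terminate connections of this link group abnormally */ - smc_lgr_terminate_sched(smc_get_lgr(link)); + /* terminate link */ + smcr_link_down_cond_sched(link); } if (pnd_snd.handler) pnd_snd.handler(&pnd_snd.priv, link, wc->status); @@ -212,8 +212,8 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), SMC_WR_TX_WAIT_FREE_SLOT_TIME); if (!rc) { - /* timeout - terminate connections */ - smc_lgr_terminate_sched(lgr); + /* timeout - terminate link */ + smcr_link_down_cond_sched(link); return -EPIPE; } if (idx == link->wr_tx_cnt) @@ -270,7 +270,7 @@ int smc_wr_tx_send(struct smc_link *link,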
struct smc_wr_tx_pend_priv *priv) rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL); if (rc) { smc_wr_tx_put_slot(link, priv); - smc_lgr_terminate_sched(smc_get_lgr(link)); + smcr_link_down_cond_sched(link); } return rc; } @@ -294,8 +294,8 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) (link->wr_reg_state != POSTED), SMC_WR_REG_MR_WAIT_TIME); if (!rc) { - /* timeout - terminate connections */ - smc_lgr_terminate_sched(smc_get_lgr(link)); + /* timeout - terminate link */ + smcr_link_down_cond_sched(link); return -EPIPE; } if (rc == -ERESTARTSYS) @@ -393,10 +393,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) case IB_WC_RETRY_EXC_ERR: case IB_WC_RNR_RETRY_EXC_ERR: case IB_WC_WR_FLUSH_ERR: - /* terminate connections of this link group - * abnormally - */ - smc_lgr_terminate_sched(smc_get_lgr(link)); + smcr_link_down_cond_sched(link); break; default: smc_wr_rx_post(link); /* refill WR RX */ -- cgit v1.2.3
From 33d203302d1cc744a13349d2576c985feb469220 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 1 May 2020 12:48:10 +0200 Subject: net/smc: remove DELETE LINK processing from smc_core.c Support for multiple links makes the former DELETE LINK processing, which sent one DELETE_LINK LLC message for each single link, obsolete. Remove this processing from smc_core.c. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 33 --------------------------------- 1 file changed, 33 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 849ae3f9b796..60c708f6de51 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -202,20 +202,6 @@ void smc_lgr_cleanup_early(struct smc_connection *conn) smc_lgr_schedule_free_work_fast(lgr); } -/* Send delete link, either as client to request the initiation - * of the DELETE LINK sequence from server; or as server to - * initiate the delete processing. See smc_llc_rx_delete_link(). - */ -static int smcr_link_send_delete(struct smc_link *lnk, bool orderly) -{ - if (lnk->state == SMC_LNK_ACTIVE && - !smc_llc_send_delete_link(lnk, 0, SMC_LLC_REQ, orderly, - SMC_LLC_DEL_PROG_INIT_TERM)) { - return 0; - } - return -ENOTCONN; -} - static void smc_lgr_free(struct smc_link_group *lgr); static void smc_lgr_free_work(struct work_struct *work) @@ -241,25 +227,6 @@ static void smc_lgr_free_work(struct work_struct *work) return; } list_del_init(&lgr->list); /* remove from smc_lgr_list */ - - if (!lgr->is_smcd && !lgr->terminating) { - bool do_wait = false; - - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - struct smc_link *lnk = &lgr->lnk[i]; - /* try to send del link msg, on err free immediately */ - if (lnk->state == SMC_LNK_ACTIVE && - !smcr_link_send_delete(lnk, true)) { - /* reschedule in case we never receive a resp */ - smc_lgr_schedule_free_work(lgr); - do_wait = true; - } - } - if (do_wait) { - spin_unlock_bh(lgr_lock); - return; /* wait for resp, see smc_llc_rx_delete_link */ - } - } lgr->freeing = 1; /* this instance does the freeing, no new schedule */ spin_unlock_bh(lgr_lock); cancel_delayed_work(&lgr->free_work); -- cgit v1.2.3
From 6c868a3edc70ec9819d6a94268625d25e6bc9587 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 1 May 2020 12:48:11 +0200 Subject: net/smc: introduce smc_pnet_find_alt_roce() Introduce a new function in smc_pnet.c that searches for an alternate IB device, using an existing link group and a primary IB device.
The alternate IB device needs to be active and must have the same PNETID as the link group. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_pnet.c | 15 +++++++++++++-- net/smc/smc_pnet.h | 5 ++++- 2 files changed, 17 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index bd01c71b827a..50c96e843fab 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -777,7 +777,8 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, /* find a roce device for the given pnetid */ static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, - struct smc_init_info *ini) + struct smc_init_info *ini, + struct smc_ib_device *known_dev) { struct smc_ib_device *ibdev; int i; @@ -785,6 +786,8 @@ static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, ini->ib_dev = NULL; spin_lock(&smc_ib_devices.lock); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { + if (ibdev == known_dev) + continue; for (i = 1; i <= SMC_MAX_PORTS; i++) { if (!rdma_is_port_valid(ibdev->ibdev, i)) continue; @@ -803,6 +806,14 @@ out: spin_unlock(&smc_ib_devices.lock); } +/* find alternate roce device with same pnet_id and vlan_id */ +void smc_pnet_find_alt_roce(struct smc_link_group *lgr, + struct smc_init_info *ini, + struct smc_ib_device *known_dev) +{ + _smc_pnet_find_roce_by_pnetid(lgr->pnet_id, ini, known_dev); +} + /* if handshake network device belongs to a roce device, return its * IB device and port */ @@ -857,7 +868,7 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, smc_pnet_find_rdma_dev(ndev, ini); return; /* pnetid could not be determined */ } - _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini); + _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL); } static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h index ea207f8fc6f7..811a65986691 100644 --- a/net/smc/smc_pnet.h +++ b/net/smc/smc_pnet.h @@ -19,6 +19,7 @@ struct smc_ib_device; struct smcd_dev; struct smc_init_info; +struct smc_link_group; /** * struct smc_pnettable - SMC PNET table anchor @@ -48,5 +49,7 @@ void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini); void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini); int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port); int smc_pnetid_by_table_smcd(struct smcd_dev *smcd); - +void smc_pnet_find_alt_roce(struct smc_link_group *lgr, + struct smc_init_info *ini, + struct smc_ib_device *known_dev); #endif -- cgit v1.2.3 From 8574cf4055ab44724ee9a4c30921d3ed853d787c Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 1 May 2020 12:48:12 +0200 Subject: net/smc: allocate index for a new link Add smc_llc_alloc_alt_link() to find a free link index for a new link, depending on the new link group type. And update constants for the maximum number of links to 3 (2 symmetric and 1 dangling asymmetric link). These maximum numbers are the same as used by other implementations of the SMC-R protocol. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.h | 2 +- net/smc/smc_llc.c | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index da3cddbd1651..eb27f2eb7c8c 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -128,7 +128,7 @@ struct smc_link { /* For now we just allow one parallel link per link group. 
The SMC protocol * allows more (up to 8). */ -#define SMC_LINKS_PER_LGR_MAX 1 +#define SMC_LINKS_PER_LGR_MAX 3 #define SMC_SINGLE_LINK 0 #define SMC_FIRST_CONTACT 1 /* first contact to a peer */ diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index e478a4c11877..3a25b6ebe3a8 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -541,6 +541,30 @@ static int smc_llc_send_message(struct smc_link *link, void *llcbuf) /********************************* receive ***********************************/ +static int smc_llc_alloc_alt_link(struct smc_link_group *lgr, + enum smc_lgr_type lgr_new_t) +{ + int i; + + if (lgr->type == SMC_LGR_SYMMETRIC || + (lgr->type != SMC_LGR_SINGLE && + (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER))) + return -EMLINK; + + if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER) { + for (i = SMC_LINKS_PER_LGR_MAX - 1; i >= 0; i--) + if (lgr->lnk[i].state == SMC_LNK_UNUSED) + return i; + } else { + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + if (lgr->lnk[i].state == SMC_LNK_UNUSED) + return i; + } + return -EMLINK; +} + static void smc_llc_rx_delete_link(struct smc_link *link, struct smc_llc_msg_del_link *llc) { -- cgit v1.2.3
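The detail worth noting in smc_llc_alloc_alt_link() above is the scan direction: a symmetric link takes the lowest free array index, while an asymmetric link, expected to be short-lived, takes the highest free index, so the two kinds do not compete for the same slots. A standalone sketch of that policy follows; the array size and state values are invented for the example (the kernel checks the link group type and SMC_LNK_UNUSED instead).

#include <stdio.h>

#define MAX_LINKS 3
enum { UNUSED, ACTIVE };

/* pick a free slot: asymmetric links scan from the top, others from the bottom */
static int alloc_link_idx(const int state[MAX_LINKS], int asymmetric)
{
	int i;

	if (asymmetric) {
		for (i = MAX_LINKS - 1; i >= 0; i--)
			if (state[i] == UNUSED)
				return i;
	} else {
		for (i = 0; i < MAX_LINKS; i++)
			if (state[i] == UNUSED)
				return i;
	}
	return -1; /* no free slot */
}

int main(void)
{
	int state[MAX_LINKS] = { ACTIVE, UNUSED, UNUSED };

	printf("symmetric slot: %d\n", alloc_link_idx(state, 0));  /* 1 */
	printf("asymmetric slot: %d\n", alloc_link_idx(state, 1)); /* 2 */
	return 0;
}

From b45e7f98ab7c2d7035d92100ee011584693eccce Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 1 May 2020 12:48:13 +0200 Subject: net/smc: llc_add_link_work to handle ADD_LINK LLC requests Introduce a work that is scheduled when a new ADD_LINK LLC request is received. The work will call either the SMC client or SMC server ADD_LINK processing. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.h | 1 + net/smc/smc_llc.c | 24 ++++++++++++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index eb27f2eb7c8c..555ada9d2423 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -253,6 +253,7 @@ struct smc_link_group { /* protects llc_event_q */ struct mutex llc_conf_mutex; /* protects lgr reconfig.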
*/ + struct work_struct llc_add_link_work; struct work_struct llc_event_work; /* llc event worker */ wait_queue_head_t llc_waiter; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 3a25b6ebe3a8..50f59746bdf9 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -565,6 +565,24 @@ static int smc_llc_alloc_alt_link(struct smc_link_group *lgr, return -EMLINK; } +/* worker to process an add link message */ +static void smc_llc_add_link_work(struct work_struct *work) +{ + struct smc_link_group *lgr = container_of(work, struct smc_link_group, + llc_add_link_work); + + if (list_empty(&lgr->list)) { + /* link group is terminating */ + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + goto out; + } + + /* tbd: call smc_llc_process_cli_add_link(lgr); */ + /* tbd: call smc_llc_process_srv_add_link(lgr); */ +out: + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); +} + static void smc_llc_rx_delete_link(struct smc_link *link, struct smc_llc_msg_del_link *llc) { @@ -685,11 +703,11 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) wake_up_interruptible(&lgr->llc_waiter); } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) { - /* tbd: schedule_work(&lgr->llc_add_link_work); */ + schedule_work(&lgr->llc_add_link_work); } } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) { /* as smc server, handle client suggestion */ - /* tbd: schedule_work(&lgr->llc_add_link_work); */ + schedule_work(&lgr->llc_add_link_work); } return; case SMC_LLC_CONFIRM_LINK: @@ -868,6 +886,7 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) struct net *net = sock_net(smc->clcsock->sk); INIT_WORK(&lgr->llc_event_work, smc_llc_event_work); + INIT_WORK(&lgr->llc_add_link_work, smc_llc_add_link_work); INIT_LIST_HEAD(&lgr->llc_event_q); spin_lock_init(&lgr->llc_event_q_lock); spin_lock_init(&lgr->llc_flow_lock); @@ -882,6 +901,7 @@ void smc_llc_lgr_clear(struct smc_link_group *lgr) smc_llc_event_flush(lgr); wake_up_interruptible_all(&lgr->llc_waiter); cancel_work_sync(&lgr->llc_event_work); + cancel_work_sync(&lgr->llc_add_link_work); if (lgr->delayed_event) { kfree(lgr->delayed_event); lgr->delayed_event = NULL; -- cgit v1.2.3 From 40b94224c339e44f689e713875c6c27c9c1270a7 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 2 May 2020 16:41:06 -0700 Subject: smc: Remove unused function. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit net/smc/smc_llc.c:544:12: warning: ‘smc_llc_alloc_alt_link’ defined but not used [-Wunused-function] static int smc_llc_alloc_alt_link(struct smc_link_group *lgr, ^~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: David S. 
Miller --- net/smc/smc_llc.c | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 50f59746bdf9..4e3db4d4b783 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -541,30 +541,6 @@ static int smc_llc_send_message(struct smc_link *link, void *llcbuf) /********************************* receive ***********************************/ -static int smc_llc_alloc_alt_link(struct smc_link_group *lgr, - enum smc_lgr_type lgr_new_t) -{ - int i; - - if (lgr->type == SMC_LGR_SYMMETRIC || - (lgr->type != SMC_LGR_SINGLE && - (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || - lgr_new_t == SMC_LGR_ASYMMETRIC_PEER))) - return -EMLINK; - - if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || - lgr_new_t == SMC_LGR_ASYMMETRIC_PEER) { - for (i = SMC_LINKS_PER_LGR_MAX - 1; i >= 0; i--) - if (lgr->lnk[i].state == SMC_LNK_UNUSED) - return i; - } else { - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) - if (lgr->lnk[i].state == SMC_LNK_UNUSED) - return i; - } - return -EMLINK; -} - /* worker to process an add link message */ static void smc_llc_add_link_work(struct work_struct *work) { -- cgit v1.2.3
From ee1bd483cc062d5050f9537064651dd2e06baee7 Mon Sep 17 00:00:00 2001 From: Dmitry Yakunin Date: Sat, 2 May 2020 18:34:42 +0300 Subject: inet_diag: bc: read cgroup id only for full sockets Fix bug introduced by commit b1f3e43dbfac ("inet_diag: add support for cgroup filter"). Signed-off-by: Dmitry Yakunin Reported-by: syzbot+ee80f840d9bf6893223b@syzkaller.appspotmail.com Reported-by: syzbot+13bef047dbfffa5cd1af@syzkaller.appspotmail.com Fixes: b1f3e43dbfac ("inet_diag: add support for cgroup filter") Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 0034092358c3..125f4f8a36b4 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -746,7 +746,8 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) else entry.mark = 0; #ifdef CONFIG_SOCK_CGROUP_DATA - entry.cgroup_id = cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)); + entry.cgroup_id = sk_fullsock(sk) ? + cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)) : 0; #endif return inet_diag_bc_run(bc, &entry); -- cgit v1.2.3
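The inet_diag fix above guards against request and time-wait sockets: those are "mini" sockets that share only a common header with a full struct sock, so sk_cgrp_data does not exist in them and must not be dereferenced. A standalone analogue of the sk_fullsock() guard, with all types invented for the illustration (the kernel derives fullness from the socket state rather than a custom flag):

#include <stdio.h>

struct sock_common { int state; };  /* header shared by all socket flavors */
struct full_sock { struct sock_common c; unsigned long cgroup_id; };

#define TW_STATE 1

static int sk_is_full(const struct sock_common *c)
{
	return c->state != TW_STATE; /* mini sockets carry only the header */
}

static unsigned long diag_cgroup_id(const struct sock_common *c)
{
	/* only downcast when the object really is a full socket */
	return sk_is_full(c) ? ((const struct full_sock *)c)->cgroup_id : 0;
}

int main(void)
{
	struct full_sock full = { .c = { .state = 0 }, .cgroup_id = 42 };
	struct sock_common tw = { .state = TW_STATE };

	printf("full: %lu, tw: %lu\n", diag_cgroup_id(&full.c), diag_cgroup_id(&tw));
	return 0;
}

From dde0a648fc00e2156a3358600c5fbfb3f53256ac Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 2 May 2020 19:54:18 -0700 Subject: net_sched: sch_fq: avoid touching f->next from fq_gc() A significant number of CPU cycles is spent in fq_gc(). When fq_gc() does its lookup in the rb-tree, it needs the following fields from struct fq_flow : f->sk (lookup key in the rb-tree) f->fq_node (anchor in the rb-tree) f->next (used to determine if the flow is detached) f->age (used to determine if the flow is candidate for gc) This unfortunately spans two cache lines (assuming 64-byte cache lines). We can avoid using f->next if we use the low order bit of f->{age|tail}. This low order bit is 0 if f->tail points to an sk_buff. We set the low order bit to 1 if the union contains a jiffies value. Combined with the following patch, this makes sure we only need to bring into cpu caches one cache line per flow. Signed-off-by: Eric Dumazet Signed-off-by: David S.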
Miller --- net/sched/sch_fq.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 4c060134c736..bc9ca1ba507b 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -70,14 +70,14 @@ struct fq_flow { struct sk_buff *head; /* list of skbs for this flow : first skb */ union { struct sk_buff *tail; /* last skb in the list */ - unsigned long age; /* jiffies when flow was emptied, for gc */ + unsigned long age; /* (jiffies | 1UL) when flow was emptied, for gc */ }; struct rb_node fq_node; /* anchor in fq_root[] trees */ struct sock *sk; int qlen; /* number of packets in flow queue */ int credit; u32 socket_hash; /* sk_hash */ - struct fq_flow *next; /* next pointer in RR lists, or &detached */ + struct fq_flow *next; /* next pointer in RR lists */ struct rb_node rate_node; /* anchor in q->delayed tree */ u64 time_next_packet; @@ -126,20 +126,25 @@ struct fq_sched_data { struct qdisc_watchdog watchdog; }; -/* special value to mark a detached flow (not on old/new list) */ -static struct fq_flow detached, throttled; - +/* + * f->tail and f->age share the same location. + * We can use the low order bit to differentiate if this location points + * to a sk_buff or contains a jiffies value, if we force this value to be odd. + * This assumes f->tail low order bit must be 0 since alignof(struct sk_buff) >= 2 + */ static void fq_flow_set_detached(struct fq_flow *f) { - f->next = &detached; - f->age = jiffies; + f->age = jiffies | 1UL; } static bool fq_flow_is_detached(const struct fq_flow *f) { - return f->next == &detached; + return !!(f->age & 1UL); } +/* special value to mark a throttled flow (not on old/new list) */ +static struct fq_flow throttled; + static bool fq_flow_is_throttled(const struct fq_flow *f) { return f->next == &throttled; -- cgit v1.2.3
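The pointer-tagging trick above works because struct sk_buff is at least 2-byte aligned, so bit 0 of a genuine f->tail pointer is always zero; storing jiffies | 1UL in the same union word therefore marks a detached flow without ever loading the second cache line for f->next. A standalone sketch of the tagging, with struct and field names invented for the illustration:

#include <stdio.h>
#include <time.h>

struct pkt { int len; }; /* stands in for sk_buff; alignment >= 2 */

union tail_or_age {
	struct pkt *tail;   /* a valid pointer: bit 0 is 0 */
	unsigned long age;  /* timestamp | 1UL: bit 0 is 1 */
};

static void set_detached(union tail_or_age *u, unsigned long now)
{
	u->age = now | 1UL; /* force the value odd to distinguish it */
}

static int is_detached(const union tail_or_age *u)
{
	return u->age & 1UL;
}

int main(void)
{
	union tail_or_age u;
	struct pkt p = { 100 };

	u.tail = &p;
	printf("attached: %d\n", is_detached(&u)); /* 0 */
	set_detached(&u, (unsigned long)time(NULL));
	printf("detached: %d\n", is_detached(&u)); /* 1 */
	return 0;
}

From 7ba0537c2b534149be288f851900b4cf5aacde48 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 2 May 2020 19:54:19 -0700 Subject: net_sched: sch_fq: change fq_flow size/layout sizeof(struct fq_flow) is 112 bytes on 64bit arches. This means that half of them use two cache lines, and the other half use three cache lines. This patch adds cache line alignment, and makes sure that only the first cache line is touched by fq_enqueue(), which is more expensive than fq_dequeue() in general. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_fq.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index bc9ca1ba507b..ced1f987d7e4 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -66,6 +66,7 @@ static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb) * in linear list (head,tail), otherwise are placed in a rbtree (t_root).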
*/ struct fq_flow { +/* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */ struct rb_root t_root; struct sk_buff *head; /* list of skbs for this flow : first skb */ union { @@ -74,14 +75,18 @@ struct fq_flow { }; struct rb_node fq_node; /* anchor in fq_root[] trees */ struct sock *sk; + u32 socket_hash; /* sk_hash */ int qlen; /* number of packets in flow queue */ + +/* Second cache line, used in fq_dequeue() */ int credit; - u32 socket_hash; /* sk_hash */ + /* 32bit hole on 64bit arches */ + struct fq_flow *next; /* next pointer in RR lists */ struct rb_node rate_node; /* anchor in q->delayed tree */ u64 time_next_packet; -}; +} ____cacheline_aligned_in_smp; struct fq_flow_head { struct fq_flow *first; -- cgit v1.2.3 From 82a0aa53b520edf50e08bad347d87d898de414eb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 2 May 2020 19:54:20 -0700 Subject: net_sched: sch_fq: use bulk freeing in fq_gc() fq_gc() already builds a small array of pointers, so using kmem_cache_free_bulk() needs very little change. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_fq.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index ced1f987d7e4..53ec47ff8469 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -214,9 +214,10 @@ static void fq_gc(struct fq_sched_data *q, struct rb_root *root, struct sock *sk) { - struct fq_flow *f, *tofree[FQ_GC_MAX]; struct rb_node **p, *parent; - int fcnt = 0; + void *tofree[FQ_GC_MAX]; + struct fq_flow *f; + int i, fcnt = 0; p = &root->rb_node; parent = NULL; @@ -239,15 +240,18 @@ static void fq_gc(struct fq_sched_data *q, p = &parent->rb_left; } + if (!fcnt) + return; + + for (i = fcnt; i > 0; ) { + f = tofree[--i]; + rb_erase(&f->fq_node, root); + } q->flows -= fcnt; q->inactive_flows -= fcnt; q->stat_gc_flows += fcnt; - while (fcnt) { - struct fq_flow *f = tofree[--fcnt]; - rb_erase(&f->fq_node, root); - kmem_cache_free(fq_flow_cachep, f); - } + kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree); } static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) -- cgit v1.2.3 From c288b0ca86a0a49ad450cf2d76a9a70c2ca9e43f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 2 May 2020 19:54:21 -0700 Subject: net_sched: sch_fq: do not call fq_peek() twice per packet This refactors the code to not call fq_peek() from fq_dequeue_head() since the caller can provide the skb. Also rename fq_dequeue_head() to fq_dequeue_skb() because 'head' is a bit vague, given the skb could come from t_root rb-tree. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_fq.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 53ec47ff8469..567df8fcaf70 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -384,19 +384,17 @@ static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow, } } -/* remove one skb from head of flow queue */ -static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow) +/* Remove one skb from flow queue. + * This skb must be the return value of prior fq_peek(). 
+ */ +static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow, + struct sk_buff *skb) { - struct sk_buff *skb = fq_peek(flow); - - if (skb) { - fq_erase_head(sch, flow, skb); - skb_mark_not_on_list(skb); - flow->qlen--; - qdisc_qstats_backlog_dec(sch, skb); - sch->q.qlen--; - } - return skb; + fq_erase_head(sch, flow, skb); + skb_mark_not_on_list(skb); + flow->qlen--; + qdisc_qstats_backlog_dec(sch, skb); + sch->q.qlen--; } static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) @@ -508,9 +506,11 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) if (!sch->q.qlen) return NULL; - skb = fq_dequeue_head(sch, &q->internal); - if (skb) + skb = fq_peek(&q->internal); + if (unlikely(skb)) { + fq_dequeue_skb(sch, &q->internal, skb); goto out; + } now = ktime_get_ns(); fq_check_throttled(q, now); @@ -550,10 +550,8 @@ begin: INET_ECN_set_ce(skb); q->stat_ce_mark++; } - } - - skb = fq_dequeue_head(sch, f); - if (!skb) { + fq_dequeue_skb(sch, f, skb); + } else { head->first = f->next; /* force a pass through old_flows to prevent starvation */ if ((head == &q->new_flows) && q->old_flows.first) { -- cgit v1.2.3 From 348e289b0f23451d1c47b940e759f0f3a0c5756e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 2 May 2020 19:54:22 -0700 Subject: net_sched: sch_fq: perform a prefetch() earlier The prefetch() done in fq_dequeue() can be done a bit earlier after the refactoring of the code done in the prior patch. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_fq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 567df8fcaf70..4f0104243cc2 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -546,6 +546,7 @@ begin: fq_flow_set_throttled(q, f); goto begin; } + prefetch(&skb->end); if ((s64)(now - time_next_packet - q->ce_threshold) > 0) { INET_ECN_set_ce(skb); q->stat_ce_mark++; @@ -562,7 +563,6 @@ begin: } goto begin; } - prefetch(&skb->end); plen = qdisc_pkt_len(skb); f->credit -= plen; -- cgit v1.2.3 From 336ba09f2ef71b82f07c1200be0ddf4eb923d69f Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Sun, 3 May 2020 14:38:40 +0200 Subject: net/smc: first part of add link processing as SMC client First set of functions to process an ADD_LINK LLC request as an SMC client. Find an alternate IB device, determine the new link group type and get the index for the new link. Then ready the link, map the buffers and send an ADD_LINK LLC response. If any error occurs, send a reject LLC message and terminate the processing. Add smc_llc_alloc_alt_link() to find a free link index for a new link, depending on the new link group type. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_core.c | 4 +- net/smc/smc_core.h | 2 + net/smc/smc_llc.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 111 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 60c708f6de51..2f8faa9c9e8e 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -273,8 +273,8 @@ static u8 smcr_next_link_id(struct smc_link_group *lgr) return link_id; } -static int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, - u8 link_idx, struct smc_init_info *ini) +int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, + u8 link_idx, struct smc_init_info *ini) { u8 rndvec[3]; int rc; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 555ada9d2423..4e00819e2db7 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -374,6 +374,8 @@ void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr); int smc_core_init(void); void smc_core_exit(void); +int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, + u8 link_idx, struct smc_init_info *ini); void smcr_link_clear(struct smc_link *lnk); int smcr_buf_map_lgr(struct smc_link *lnk); int smcr_buf_reg_lgr(struct smc_link *lnk); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 4e3db4d4b783..8716d8739329 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -17,6 +17,7 @@ #include "smc_core.h" #include "smc_clc.h" #include "smc_llc.h" +#include "smc_pnet.h" #define SMC_LLC_DATA_LEN 40 @@ -541,6 +542,112 @@ static int smc_llc_send_message(struct smc_link *link, void *llcbuf) /********************************* receive ***********************************/ +static int smc_llc_alloc_alt_link(struct smc_link_group *lgr, + enum smc_lgr_type lgr_new_t) +{ + int i; + + if (lgr->type == SMC_LGR_SYMMETRIC || + (lgr->type != SMC_LGR_SINGLE && + (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER))) + return -EMLINK; + + if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER) { + for (i = SMC_LINKS_PER_LGR_MAX - 1; i >= 0; i--) + if (lgr->lnk[i].state == SMC_LNK_UNUSED) + return i; + } else { + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + if (lgr->lnk[i].state == SMC_LNK_UNUSED) + return i; + } + return -EMLINK; +} + +/* prepare and send an add link reject response */ +static int smc_llc_cli_add_link_reject(struct smc_llc_qentry *qentry) +{ + qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP; + qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_ADD_LNK_REJ; + qentry->msg.raw.hdr.add_link_rej_rsn = SMC_LLC_REJ_RSN_NO_ALT_PATH; + return smc_llc_send_message(qentry->link, &qentry->msg); +} + +static void smc_llc_save_add_link_info(struct smc_link *link, + struct smc_llc_msg_add_link *add_llc) +{ + link->peer_qpn = ntoh24(add_llc->sender_qp_num); + memcpy(link->peer_gid, add_llc->sender_gid, SMC_GID_SIZE); + memcpy(link->peer_mac, add_llc->sender_mac, ETH_ALEN); + link->peer_psn = ntoh24(add_llc->initial_psn); + link->peer_mtu = add_llc->qp_mtu; +} + +/* as an SMC client, process an add link request */ +int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) +{ + struct smc_llc_msg_add_link *llc = &qentry->msg.add_link; + enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC; + struct smc_link_group *lgr = smc_get_lgr(link); + struct smc_link *lnk_new = NULL; + struct smc_init_info ini; + int lnk_idx, rc = 0; + + ini.vlan_id = lgr->vlan_id; + smc_pnet_find_alt_roce(lgr, &ini, link->smcibdev); + if (!memcmp(llc->sender_gid, link->peer_gid, SMC_GID_SIZE) 
&& + !memcmp(llc->sender_mac, link->peer_mac, ETH_ALEN)) { + if (!ini.ib_dev) + goto out_reject; + lgr_new_t = SMC_LGR_ASYMMETRIC_PEER; + } + if (!ini.ib_dev) { + lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL; + ini.ib_dev = link->smcibdev; + ini.ib_port = link->ibport; + } + lnk_idx = smc_llc_alloc_alt_link(lgr, lgr_new_t); + if (lnk_idx < 0) + goto out_reject; + lnk_new = &lgr->lnk[lnk_idx]; + rc = smcr_link_init(lgr, lnk_new, lnk_idx, &ini); + if (rc) + goto out_reject; + smc_llc_save_add_link_info(lnk_new, llc); + lnk_new->link_id = llc->link_num; + + rc = smc_ib_ready_link(lnk_new); + if (rc) + goto out_clear_lnk; + + rc = smcr_buf_map_lgr(lnk_new); + if (rc) + goto out_clear_lnk; + + rc = smc_llc_send_add_link(link, + lnk_new->smcibdev->mac[ini.ib_port - 1], + lnk_new->gid, lnk_new, SMC_LLC_RESP); + if (rc) + goto out_clear_lnk; + /* tbd: rc = smc_llc_cli_rkey_exchange(link, lnk_new); */ + if (rc) { + rc = 0; + goto out_clear_lnk; + } + /* tbd: rc = smc_llc_cli_conf_link(link, &ini, lnk_new, lgr_new_t); */ + if (!rc) + goto out; +out_clear_lnk: + smcr_link_clear(lnk_new); +out_reject: + smc_llc_cli_add_link_reject(qentry); +out: + kfree(qentry); + return rc; +} + /* worker to process an add link message */ static void smc_llc_add_link_work(struct work_struct *work) { -- cgit v1.2.3 From 87f88cda2128a72d79d4cc700729488af1081a06 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Sun, 3 May 2020 14:38:41 +0200 Subject: net/smc: rkey processing for a new link as SMC client Part of the SMC client new link establishment process is the exchange of rkeys for all used buffers. Add new LLC message type ADD_LINK_CONTINUE which is used to exchange rkeys of all current RMB buffers. Add functions to iterate over all used RMB buffers of the link group, and implement the ADD_LINK_CONTINUE processing. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_llc.c | 157 +++++++++++++++++++++++++++++++++++++++++++++++++++++- net/smc/smc_llc.h | 1 + 2 files changed, 157 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 8716d8739329..a06b618f172e 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -70,6 +70,23 @@ struct smc_llc_msg_add_link { /* type 0x02 */ u8 reserved[8]; }; +struct smc_llc_msg_add_link_cont_rt { + __be32 rmb_key; + __be32 rmb_key_new; + __be64 rmb_vaddr_new; +}; + +#define SMC_LLC_RKEYS_PER_CONT_MSG 2 + +struct smc_llc_msg_add_link_cont { /* type 0x03 */ + struct smc_llc_hdr hd; + u8 link_num; + u8 num_rkeys; + u8 reserved2[2]; + struct smc_llc_msg_add_link_cont_rt rt[SMC_LLC_RKEYS_PER_CONT_MSG]; + u8 reserved[4]; +} __packed; /* format defined in RFC7609 */ + #define SMC_LLC_FLAG_DEL_LINK_ALL 0x40 #define SMC_LLC_FLAG_DEL_LINK_ORDERLY 0x20 @@ -121,6 +138,7 @@ struct smc_llc_msg_delete_rkey { /* type 0x09 */ union smc_llc_msg { struct smc_llc_msg_confirm_link confirm_link; struct smc_llc_msg_add_link add_link; + struct smc_llc_msg_add_link_cont add_link_cont; struct smc_llc_msg_del_link delete_link; struct smc_llc_msg_confirm_rkey confirm_rkey; @@ -566,6 +584,137 @@ static int smc_llc_alloc_alt_link(struct smc_link_group *lgr, return -EMLINK; } +/* return first buffer from any of the next buf lists */ +static struct smc_buf_desc *_smc_llc_get_next_rmb(struct smc_link_group *lgr, + int *buf_lst) +{ + struct smc_buf_desc *buf_pos; + + while (*buf_lst < SMC_RMBE_SIZES) { + buf_pos = list_first_entry_or_null(&lgr->rmbs[*buf_lst], + struct smc_buf_desc, list); + if (buf_pos) + return buf_pos; + (*buf_lst)++; + } + return NULL; +} + +/* return next rmb from buffer lists */ +static struct smc_buf_desc *smc_llc_get_next_rmb(struct smc_link_group *lgr, + int *buf_lst, + struct smc_buf_desc *buf_pos) +{ + struct smc_buf_desc *buf_next; + + if (!buf_pos || list_is_last(&buf_pos->list, &lgr->rmbs[*buf_lst])) { + (*buf_lst)++; + return _smc_llc_get_next_rmb(lgr, buf_lst); + } + buf_next = list_next_entry(buf_pos, list); + return buf_next; +} + +static struct smc_buf_desc *smc_llc_get_first_rmb(struct smc_link_group *lgr, + int *buf_lst) +{ + *buf_lst = 0; + return smc_llc_get_next_rmb(lgr, buf_lst, NULL); +} + +/* send one add_link_continue msg */ +static int smc_llc_add_link_cont(struct smc_link *link, + struct smc_link *link_new, u8 *num_rkeys_todo, + int *buf_lst, struct smc_buf_desc **buf_pos) +{ + struct smc_llc_msg_add_link_cont *addc_llc; + struct smc_link_group *lgr = link->lgr; + int prim_lnk_idx, lnk_idx, i, rc; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + struct smc_buf_desc *rmb; + u8 n; + + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + return rc; + addc_llc = (struct smc_llc_msg_add_link_cont *)wr_buf; + memset(addc_llc, 0, sizeof(*addc_llc)); + + prim_lnk_idx = link->link_idx; + lnk_idx = link_new->link_idx; + addc_llc->link_num = link_new->link_id; + addc_llc->num_rkeys = *num_rkeys_todo; + n = *num_rkeys_todo; + for (i = 0; i < min_t(u8, n, SMC_LLC_RKEYS_PER_CONT_MSG); i++) { + if (!*buf_pos) { + addc_llc->num_rkeys = addc_llc->num_rkeys - + *num_rkeys_todo; + *num_rkeys_todo = 0; + break; + } + rmb = *buf_pos; + + addc_llc->rt[i].rmb_key = htonl(rmb->mr_rx[prim_lnk_idx]->rkey); + addc_llc->rt[i].rmb_key_new = htonl(rmb->mr_rx[lnk_idx]->rkey); + addc_llc->rt[i].rmb_vaddr_new = + cpu_to_be64((u64)sg_dma_address(rmb->sgt[lnk_idx].sgl)); + + (*num_rkeys_todo)--; + *buf_pos = smc_llc_get_next_rmb(lgr, 
buf_lst, *buf_pos); + while (*buf_pos && !(*buf_pos)->used) + *buf_pos = smc_llc_get_next_rmb(lgr, buf_lst, *buf_pos); + } + addc_llc->hd.common.type = SMC_LLC_ADD_LINK_CONT; + addc_llc->hd.length = sizeof(struct smc_llc_msg_add_link_cont); + if (lgr->role == SMC_CLNT) + addc_llc->hd.flags |= SMC_LLC_FLAG_RESP; + return smc_wr_tx_send(link, pend); +} + +static int smc_llc_cli_rkey_exchange(struct smc_link *link, + struct smc_link *link_new) +{ + struct smc_llc_msg_add_link_cont *addc_llc; + struct smc_link_group *lgr = link->lgr; + u8 max, num_rkeys_send, num_rkeys_recv; + struct smc_llc_qentry *qentry; + struct smc_buf_desc *buf_pos; + int buf_lst; + int rc = 0; + int i; + + mutex_lock(&lgr->rmbs_lock); + num_rkeys_send = lgr->conns_num; + buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst); + do { + qentry = smc_llc_wait(lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK_CONT); + if (!qentry) { + rc = -ETIMEDOUT; + break; + } + addc_llc = &qentry->msg.add_link_cont; + num_rkeys_recv = addc_llc->num_rkeys; + max = min_t(u8, num_rkeys_recv, SMC_LLC_RKEYS_PER_CONT_MSG); + for (i = 0; i < max; i++) { + smc_rtoken_set(lgr, link->link_idx, link_new->link_idx, + addc_llc->rt[i].rmb_key, + addc_llc->rt[i].rmb_vaddr_new, + addc_llc->rt[i].rmb_key_new); + num_rkeys_recv--; + } + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + rc = smc_llc_add_link_cont(link, link_new, &num_rkeys_send, + &buf_lst, &buf_pos); + if (rc) + break; + } while (num_rkeys_send || num_rkeys_recv); + + mutex_unlock(&lgr->rmbs_lock); + return rc; +} + /* prepare and send an add link reject response */ static int smc_llc_cli_add_link_reject(struct smc_llc_qentry *qentry) { @@ -631,7 +780,7 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) lnk_new->gid, lnk_new, SMC_LLC_RESP); if (rc) goto out_clear_lnk; - /* tbd: rc = smc_llc_cli_rkey_exchange(link, lnk_new); */ + rc = smc_llc_cli_rkey_exchange(link, lnk_new); if (rc) { rc = 0; goto out_clear_lnk; @@ -794,6 +943,7 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) } return; case SMC_LLC_CONFIRM_LINK: + case SMC_LLC_ADD_LINK_CONT: if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { /* a flow is waiting for this message */ smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, qentry); @@ -873,6 +1023,7 @@ static void smc_llc_rx_response(struct smc_link *link, break; case SMC_LLC_ADD_LINK: case SMC_LLC_CONFIRM_LINK: + case SMC_LLC_ADD_LINK_CONT: case SMC_LLC_CONFIRM_RKEY: case SMC_LLC_DELETE_RKEY: /* assign responses to the local flow, we requested them */ @@ -1092,6 +1243,10 @@ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { .handler = smc_llc_rx_handler, .type = SMC_LLC_ADD_LINK }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_ADD_LINK_CONT + }, { .handler = smc_llc_rx_handler, .type = SMC_LLC_DELETE_LINK diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 4ed4486e5082..97a4f02f5a93 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -28,6 +28,7 @@ enum smc_llc_reqresp { enum smc_llc_msg_type { SMC_LLC_CONFIRM_LINK = 0x01, SMC_LLC_ADD_LINK = 0x02, + SMC_LLC_ADD_LINK_CONT = 0x03, SMC_LLC_DELETE_LINK = 0x04, SMC_LLC_CONFIRM_RKEY = 0x06, SMC_LLC_TEST_LINK = 0x07, -- cgit v1.2.3
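The rkey exchange just added is a small windowed protocol: each ADD_LINK_CONTINUE message carries at most SMC_LLC_RKEYS_PER_CONT_MSG (two) rkey/vaddr translations, and both sides loop until their send and receive counters reach zero. A standalone sketch of the sender-side chunking, with the message layout and names invented for the illustration:

#include <stdio.h>

#define RKEYS_PER_MSG 2

struct cont_msg { int num_rkeys; unsigned int rkey[RKEYS_PER_MSG]; };

/* emit the buffers' rkeys in chunks of RKEYS_PER_MSG, as the LLC flow does */
static void send_rkeys(const unsigned int *rkeys, int todo)
{
	while (todo > 0) {
		struct cont_msg msg = { 0 };
		int i;

		for (i = 0; i < RKEYS_PER_MSG && todo > 0; i++, todo--)
			msg.rkey[msg.num_rkeys++] = *rkeys++;
		printf("msg with %d rkeys (first 0x%x)\n",
		       msg.num_rkeys, msg.rkey[0]);
	}
}

int main(void)
{
	unsigned int rkeys[] = { 0x11, 0x22, 0x33, 0x44, 0x55 };

	send_rkeys(rkeys, 5); /* 3 messages: 2 + 2 + 1 rkeys */
	return 0;
}

From b1570a87f57e94e9f74b8942840f9bd16bd1aba5 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Sun, 3 May 2020 14:38:42 +0200 Subject: net/smc: final part of add link processing as SMC client This patch finalizes the ADD_LINK processing of new links.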
Receive the CONFIRM_LINK request from peer, complete the link initialization, register all used buffers with the IB device and finally send the CONFIRM_LINK response, which completes the ADD_LINK processing. And activate smc_llc_cli_add_link() in af_smc.c. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 2 +- net/smc/smc_llc.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- net/smc/smc_llc.h | 1 + 3 files changed, 72 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 6663a63be9e4..1afb6e4275f2 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -427,7 +427,7 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) return rc; } smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); - /* tbd: call smc_llc_cli_add_link(link, qentry); */ + smc_llc_cli_add_link(link, qentry); return 0; } diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index a06b618f172e..d56ca60597d4 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -381,7 +381,7 @@ int smc_llc_send_confirm_link(struct smc_link *link, hton24(confllc->sender_qp_num, link->roce_qp->qp_num); confllc->link_num = link->link_id; memcpy(confllc->link_uid, lgr->id, SMC_LGR_ID_SIZE); - confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS; /* enforce peer resp. */ + confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS; /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; @@ -724,6 +724,61 @@ static int smc_llc_cli_add_link_reject(struct smc_llc_qentry *qentry) return smc_llc_send_message(qentry->link, &qentry->msg); } +static int smc_llc_cli_conf_link(struct smc_link *link, + struct smc_init_info *ini, + struct smc_link *link_new, + enum smc_lgr_type lgr_new_t) +{ + struct smc_link_group *lgr = link->lgr; + struct smc_llc_msg_del_link *del_llc; + struct smc_llc_qentry *qentry = NULL; + int rc = 0; + + /* receive CONFIRM LINK request over RoCE fabric */ + qentry = smc_llc_wait(lgr, NULL, SMC_LLC_WAIT_FIRST_TIME, 0); + if (!qentry) { + rc = smc_llc_send_delete_link(link, link_new->link_id, + SMC_LLC_REQ, false, + SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; + } + if (qentry->msg.raw.hdr.common.type != SMC_LLC_CONFIRM_LINK) { + /* received DELETE_LINK instead */ + del_llc = &qentry->msg.delete_link; + qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, &qentry->msg); + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + return -ENOLINK; + } + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + + rc = smc_ib_modify_qp_rts(link_new); + if (rc) { + smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ, + false, SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; + } + smc_wr_remember_qp_attr(link_new); + + rc = smcr_buf_reg_lgr(link_new); + if (rc) { + smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ, + false, SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; + } + + /* send CONFIRM LINK response over RoCE fabric */ + rc = smc_llc_send_confirm_link(link_new, SMC_LLC_RESP); + if (rc) { + smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ, + false, SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; + } + smc_llc_link_active(link_new); + lgr->type = lgr_new_t; + return 0; +} + static void smc_llc_save_add_link_info(struct smc_link *link, struct smc_llc_msg_add_link *add_llc) { @@ -785,7 +840,7 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) rc = 0; goto out_clear_lnk; } - /* tbd: rc = smc_llc_cli_conf_link(link, &ini, lnk_new, lgr_new_t); */ + rc = 
smc_llc_cli_conf_link(link, &ini, lnk_new, lgr_new_t); if (!rc) goto out; out_clear_lnk: @@ -797,6 +852,17 @@ out: return rc; } +static void smc_llc_process_cli_add_link(struct smc_link_group *lgr) +{ + struct smc_llc_qentry *qentry; + + qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); + + mutex_lock(&lgr->llc_conf_mutex); + smc_llc_cli_add_link(qentry->link, qentry); + mutex_unlock(&lgr->llc_conf_mutex); +} + /* worker to process an add link message */ static void smc_llc_add_link_work(struct work_struct *work) { @@ -809,7 +875,8 @@ static void smc_llc_add_link_work(struct work_struct *work) goto out; } - /* tbd: call smc_llc_process_cli_add_link(lgr); */ + if (lgr->role == SMC_CLNT) + smc_llc_process_cli_add_link(lgr); /* tbd: call smc_llc_process_srv_add_link(lgr); */ out: smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 97a4f02f5a93..7c314bbef8c8 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -88,6 +88,7 @@ struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, int time_out, u8 exp_msg); struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow); void smc_llc_flow_qentry_del(struct smc_llc_flow *flow); +int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry); int smc_llc_init(void) __init; #endif /* SMC_LLC_H */ -- cgit v1.2.3 From 2d2209f2018943d4152a21eff5b76f1952e0b435 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Sun, 3 May 2020 14:38:43 +0200 Subject: net/smc: first part of add link processing as SMC server First set of functions to process an ADD_LINK LLC request as an SMC server. Find an alternate IB device, determine the new link group type and get the index for the new link. Then initialize the link and send the ADD_LINK LLC message to the peer. Save the contents of the response, ready the link, map all used buffers and register the buffers with the IB device. If any error occurs, stop the processing and clear the link. And call smc_llc_srv_add_link() in af_smc.c to start second link establishment after the initial link of a link group was created. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/af_smc.c | 2 +- net/smc/smc_llc.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- net/smc/smc_llc.h | 1 + 3 files changed, 92 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 1afb6e4275f2..c67272007f41 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1067,7 +1067,7 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) smc_llc_link_active(link); /* initial contact - try to establish second link */ - /* tbd: call smc_llc_srv_add_link(link); */ + smc_llc_srv_add_link(link); return 0; } diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index d56ca60597d4..e2f254e21759 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -863,6 +863,94 @@ static void smc_llc_process_cli_add_link(struct smc_link_group *lgr) mutex_unlock(&lgr->llc_conf_mutex); } +int smc_llc_srv_add_link(struct smc_link *link) +{ + enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC; + struct smc_link_group *lgr = link->lgr; + struct smc_llc_msg_add_link *add_llc; + struct smc_llc_qentry *qentry = NULL; + struct smc_link *link_new; + struct smc_init_info ini; + int lnk_idx, rc = 0; + + /* ignore client add link recommendation, start new flow */ + ini.vlan_id = lgr->vlan_id; + smc_pnet_find_alt_roce(lgr, &ini, link->smcibdev); + if (!ini.ib_dev) { + lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL; + ini.ib_dev = link->smcibdev; + ini.ib_port = link->ibport; + } + lnk_idx = smc_llc_alloc_alt_link(lgr, lgr_new_t); + if (lnk_idx < 0) + return 0; + + rc = smcr_link_init(lgr, &lgr->lnk[lnk_idx], lnk_idx, &ini); + if (rc) + return rc; + link_new = &lgr->lnk[lnk_idx]; + rc = smc_llc_send_add_link(link, + link_new->smcibdev->mac[ini.ib_port - 1], + link_new->gid, link_new, SMC_LLC_REQ); + if (rc) + goto out_err; + /* receive ADD LINK response over the RoCE fabric */ + qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_TIME, SMC_LLC_ADD_LINK); + if (!qentry) { + rc = -ETIMEDOUT; + goto out_err; + } + add_llc = &qentry->msg.add_link; + if (add_llc->hd.flags & SMC_LLC_FLAG_ADD_LNK_REJ) { + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + rc = -ENOLINK; + goto out_err; + } + if (lgr->type == SMC_LGR_SINGLE && + (!memcmp(add_llc->sender_gid, link->peer_gid, SMC_GID_SIZE) && + !memcmp(add_llc->sender_mac, link->peer_mac, ETH_ALEN))) { + lgr_new_t = SMC_LGR_ASYMMETRIC_PEER; + } + smc_llc_save_add_link_info(link_new, add_llc); + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + + rc = smc_ib_ready_link(link_new); + if (rc) + goto out_err; + rc = smcr_buf_map_lgr(link_new); + if (rc) + goto out_err; + rc = smcr_buf_reg_lgr(link_new); + if (rc) + goto out_err; + /* tbd: rc = smc_llc_srv_rkey_exchange(link, link_new); */ + if (rc) + goto out_err; + /* tbd: rc = smc_llc_srv_conf_link(link, link_new, lgr_new_t); */ + if (rc) + goto out_err; + return 0; +out_err: + smcr_link_clear(link_new); + return rc; +} + +static void smc_llc_process_srv_add_link(struct smc_link_group *lgr) +{ + struct smc_link *link = lgr->llc_flow_lcl.qentry->link; + int rc; + + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + + mutex_lock(&lgr->llc_conf_mutex); + rc = smc_llc_srv_add_link(link); + if (!rc && lgr->type == SMC_LGR_SYMMETRIC) { + /* delete any asymmetric link */ + /* tbd: smc_llc_delete_asym_link(lgr); */ + } + mutex_unlock(&lgr->llc_conf_mutex); +} + /* worker to process an add link message */ static void smc_llc_add_link_work(struct work_struct *work) { @@ -877,7 +965,8 @@ static void smc_llc_add_link_work(struct work_struct *work) if (lgr->role == SMC_CLNT) 
smc_llc_process_cli_add_link(lgr); - /* tbd: call smc_llc_process_srv_add_link(lgr); */ + else + smc_llc_process_srv_add_link(lgr); out: smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 7c314bbef8c8..1a7748d0541f 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -89,6 +89,7 @@ struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow); void smc_llc_flow_qentry_del(struct smc_llc_flow *flow); int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry); +int smc_llc_srv_add_link(struct smc_link *link); int smc_llc_init(void) __init; #endif /* SMC_LLC_H */ -- cgit v1.2.3 From 57b499242cb888a32815f8663b60338bcb0b5747 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Sun, 3 May 2020 14:38:44 +0200 Subject: net/smc: rkey processing for a new link as SMC server Part of SMC server new link establishment is the exchange of rkeys for used buffers. Loop over all used RMB buffers and send ADD_LINK_CONTINUE LLC messages to the peer. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_llc.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index e2f254e21759..de73432bd72f 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -863,6 +863,47 @@ static void smc_llc_process_cli_add_link(struct smc_link_group *lgr) mutex_unlock(&lgr->llc_conf_mutex); } +static int smc_llc_srv_rkey_exchange(struct smc_link *link, + struct smc_link *link_new) +{ + struct smc_llc_msg_add_link_cont *addc_llc; + struct smc_link_group *lgr = link->lgr; + u8 max, num_rkeys_send, num_rkeys_recv; + struct smc_llc_qentry *qentry = NULL; + struct smc_buf_desc *buf_pos; + int buf_lst; + int rc = 0; + int i; + + mutex_lock(&lgr->rmbs_lock); + num_rkeys_send = lgr->conns_num; + buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst); + do { + smc_llc_add_link_cont(link, link_new, &num_rkeys_send, + &buf_lst, &buf_pos); + qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK_CONT); + if (!qentry) { + rc = -ETIMEDOUT; + goto out; + } + addc_llc = &qentry->msg.add_link_cont; + num_rkeys_recv = addc_llc->num_rkeys; + max = min_t(u8, num_rkeys_recv, SMC_LLC_RKEYS_PER_CONT_MSG); + for (i = 0; i < max; i++) { + smc_rtoken_set(lgr, link->link_idx, link_new->link_idx, + addc_llc->rt[i].rmb_key, + addc_llc->rt[i].rmb_vaddr_new, + addc_llc->rt[i].rmb_key_new); + num_rkeys_recv--; + } + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + } while (num_rkeys_send || num_rkeys_recv); +out: + mutex_unlock(&lgr->rmbs_lock); + return rc; +} + int smc_llc_srv_add_link(struct smc_link *link) { enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC; @@ -923,7 +964,7 @@ int smc_llc_srv_add_link(struct smc_link *link) rc = smcr_buf_reg_lgr(link_new); if (rc) goto out_err; - /* tbd: rc = smc_llc_srv_rkey_exchange(link, link_new); */ + rc = smc_llc_srv_rkey_exchange(link, link_new); if (rc) goto out_err; /* tbd: rc = smc_llc_srv_conf_link(link, link_new, lgr_new_t); */ -- cgit v1.2.3 From 1551c95b61242b1a20565bae8d711f35a601c4f3 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Sun, 3 May 2020 14:38:45 +0200 Subject: net/smc: final part of add link processing as SMC server This patch finalizes the ADD_LINK processing of new links. Send the CONFIRM_LINK request to the peer, receive the response and set link state to ACTIVE. 
Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_llc.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index de73432bd72f..1fefee55e293 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -904,6 +904,33 @@ out: return rc; } +static int smc_llc_srv_conf_link(struct smc_link *link, + struct smc_link *link_new, + enum smc_lgr_type lgr_new_t) +{ + struct smc_link_group *lgr = link->lgr; + struct smc_llc_qentry *qentry = NULL; + int rc; + + /* send CONFIRM LINK request over the RoCE fabric */ + rc = smc_llc_send_confirm_link(link_new, SMC_LLC_REQ); + if (rc) + return -ENOLINK; + /* receive CONFIRM LINK response over the RoCE fabric */ + qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_FIRST_TIME, + SMC_LLC_CONFIRM_LINK); + if (!qentry) { + /* send DELETE LINK */ + smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ, + false, SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; + } + smc_llc_link_active(link_new); + lgr->type = lgr_new_t; + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + return 0; +} + int smc_llc_srv_add_link(struct smc_link *link) { enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC; @@ -967,7 +994,7 @@ int smc_llc_srv_add_link(struct smc_link *link) rc = smc_llc_srv_rkey_exchange(link, link_new); if (rc) goto out_err; - /* tbd: rc = smc_llc_srv_conf_link(link, link_new, lgr_new_t); */ + rc = smc_llc_srv_conf_link(link, link_new, lgr_new_t); if (rc) goto out_err; return 0; -- cgit v1.2.3
From c9a5d243035161f06175a7c6d487c9860e0f179a Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Sun, 3 May 2020 14:38:46 +0200 Subject: net/smc: delete an asymmetric link as SMC server When a link group moves from asymmetric to symmetric state, the dangling asymmetric link can be deleted. Add smc_llc_find_asym_link() to find the respective link and add smc_llc_delete_asym_link() to delete it. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_llc.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- net/smc/smc_wr.c | 2 +- net/smc/smc_wr.h | 1 + 3 files changed, 82 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 1fefee55e293..9d102c912be9 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -863,6 +863,85 @@ static void smc_llc_process_cli_add_link(struct smc_link_group *lgr) mutex_unlock(&lgr->llc_conf_mutex); } +/* find the asymmetric link when 3 links are established */ +static struct smc_link *smc_llc_find_asym_link(struct smc_link_group *lgr) +{ + int asym_idx = -ENOENT; + int i, j, k; + bool found; + + /* determine asymmetric link */ + found = false; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + for (j = i + 1; j < SMC_LINKS_PER_LGR_MAX; j++) { + if (!smc_link_usable(&lgr->lnk[i]) || + !smc_link_usable(&lgr->lnk[j])) + continue; + if (!memcmp(lgr->lnk[i].gid, lgr->lnk[j].gid, + SMC_GID_SIZE)) { + found = true; /* asym_lnk is i or j */ + break; + } + } + if (found) + break; + } + if (!found) + goto out; /* no asymmetric link */ + for (k = 0; k < SMC_LINKS_PER_LGR_MAX; k++) { + if (!smc_link_usable(&lgr->lnk[k])) + continue; + if (k != i && + !memcmp(lgr->lnk[i].peer_gid, lgr->lnk[k].peer_gid, + SMC_GID_SIZE)) { + asym_idx = i; + break; + } + if (k != j && + !memcmp(lgr->lnk[j].peer_gid, lgr->lnk[k].peer_gid, + SMC_GID_SIZE)) { + asym_idx = j; + break; + } + } +out: + return (asym_idx < 0) ?
NULL : &lgr->lnk[asym_idx]; +} + +static void smc_llc_delete_asym_link(struct smc_link_group *lgr) +{ + struct smc_link *lnk_new = NULL, *lnk_asym; + struct smc_llc_qentry *qentry; + int rc; + + lnk_asym = smc_llc_find_asym_link(lgr); + if (!lnk_asym) + return; /* no asymmetric link */ + if (!smc_link_downing(&lnk_asym->state)) + return; + /* tbd: lnk_new = smc_switch_conns(lgr, lnk_asym, false); */ + smc_wr_tx_wait_no_pending_sends(lnk_asym); + if (!lnk_new) + goto out_free; + /* change flow type from ADD_LINK into DEL_LINK */ + lgr->llc_flow_lcl.type = SMC_LLC_FLOW_DEL_LINK; + rc = smc_llc_send_delete_link(lnk_new, lnk_asym->link_id, SMC_LLC_REQ, + true, SMC_LLC_DEL_NO_ASYM_NEEDED); + if (rc) { + smcr_link_down_cond(lnk_new); + goto out_free; + } + qentry = smc_llc_wait(lgr, lnk_new, SMC_LLC_WAIT_TIME, + SMC_LLC_DELETE_LINK); + if (!qentry) { + smcr_link_down_cond(lnk_new); + goto out_free; + } + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); +out_free: + smcr_link_clear(lnk_asym); +} + static int smc_llc_srv_rkey_exchange(struct smc_link *link, struct smc_link *link_new) { @@ -1014,7 +1093,7 @@ static void smc_llc_process_srv_add_link(struct smc_link_group *lgr) rc = smc_llc_srv_add_link(link); if (!rc && lgr->type == SMC_LGR_SYMMETRIC) { /* delete any asymmetric link */ - /* tbd: smc_llc_delete_asym_link(lgr); */ + smc_llc_delete_asym_link(lgr); } mutex_unlock(&lgr->llc_conf_mutex); } diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 031e6c9561b1..3fd27bea4f7a 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -61,7 +61,7 @@ static inline bool smc_wr_is_tx_pend(struct smc_link *link) } /* wait till all pending tx work requests on the given link are completed */ -static inline int smc_wr_tx_wait_no_pending_sends(struct smc_link *link) +int smc_wr_tx_wait_no_pending_sends(struct smc_link *link) { if (wait_event_timeout(link->wr_tx_wait, !smc_wr_is_tx_pend(link), SMC_WR_TX_WAIT_PENDING_TIME)) diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 3ac99c898418..f7eaeb3391f3 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -106,6 +106,7 @@ void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type, smc_wr_tx_filter filter, smc_wr_tx_dismisser dismisser, unsigned long data); +int smc_wr_tx_wait_no_pending_sends(struct smc_link *link); int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler); int smc_wr_rx_post_init(struct smc_link *link); -- cgit v1.2.3 From 9ec6bf19ec8bb19f4211f6a2bf62c079d46b54ea Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Sun, 3 May 2020 14:38:47 +0200 Subject: net/smc: llc_del_link_work and use the LLC flow for delete link Introduce a work that is scheduled when a new DELETE_LINK LLC request is received. The work will call either the SMC client or SMC server DELETE_LINK processing. And use the LLC flow framework to process incoming DELETE_LINK LLC messages, scheduling the llc_del_link_work for those events. With these changes smc_lgr_forget() is only called by one function and can be migrated into smc_lgr_cleanup_early(). Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_core.c | 22 ++++++++-------------- net/smc/smc_core.h | 2 +- net/smc/smc_llc.c | 55 +++++++++++++++++++++++++++++++++++------------------- 3 files changed, 45 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 2f8faa9c9e8e..a964304283fa 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -193,12 +193,19 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) void smc_lgr_cleanup_early(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; + struct list_head *lgr_list; + spinlock_t *lgr_lock; if (!lgr) return; smc_conn_free(conn); - smc_lgr_forget(lgr); + lgr_list = smc_lgr_list_head(lgr, &lgr_lock); + spin_lock_bh(lgr_lock); + /* do not use this link group for new connections */ + if (!list_empty(lgr_list)) + list_del_init(lgr_list); + spin_unlock_bh(lgr_lock); smc_lgr_schedule_free_work_fast(lgr); } @@ -653,19 +660,6 @@ static void smc_lgr_free(struct smc_link_group *lgr) kfree(lgr); } -void smc_lgr_forget(struct smc_link_group *lgr) -{ - struct list_head *lgr_list; - spinlock_t *lgr_lock; - - lgr_list = smc_lgr_list_head(lgr, &lgr_lock); - spin_lock_bh(lgr_lock); - /* do not use this link group for new connections */ - if (!list_empty(lgr_list)) - list_del_init(lgr_list); - spin_unlock_bh(lgr_lock); -} - static void smcd_unregister_all_dmbs(struct smc_link_group *lgr) { int i; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 4e00819e2db7..7fe53feb9dc4 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -254,6 +254,7 @@ struct smc_link_group { struct mutex llc_conf_mutex; /* protects lgr reconfig. */ struct work_struct llc_add_link_work; + struct work_struct llc_del_link_work; struct work_struct llc_event_work; /* llc event worker */ wait_queue_head_t llc_waiter; @@ -343,7 +344,6 @@ struct smc_sock; struct smc_clc_msg_accept_confirm; struct smc_clc_msg_local; -void smc_lgr_forget(struct smc_link_group *lgr); void smc_lgr_cleanup_early(struct smc_connection *conn); void smc_lgr_terminate_sched(struct smc_link_group *lgr); void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 9d102c912be9..e4e3910a9624 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -1118,22 +1118,18 @@ out: smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } -static void smc_llc_rx_delete_link(struct smc_link *link, - struct smc_llc_msg_del_link *llc) +static void smc_llc_delete_link_work(struct work_struct *work) { - struct smc_link_group *lgr = smc_get_lgr(link); + struct smc_link_group *lgr = container_of(work, struct smc_link_group, + llc_del_link_work); - smc_lgr_forget(lgr); - if (lgr->role == SMC_SERV) { - /* client asks to delete this link, send request */ - smc_llc_send_delete_link(link, 0, SMC_LLC_REQ, true, - SMC_LLC_DEL_PROG_INIT_TERM); - } else { - /* server requests to delete this link, send response */ - smc_llc_send_delete_link(link, 0, SMC_LLC_RESP, true, - SMC_LLC_DEL_PROG_INIT_TERM); + if (list_empty(&lgr->list)) { + /* link group is terminating */ + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + goto out; } - smcr_link_down_cond(link); +out: + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } /* process a confirm_rkey request from peer, remote flow */ @@ -1255,8 +1251,30 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) } break; case SMC_LLC_DELETE_LINK: - smc_llc_rx_delete_link(link, &llc->delete_link); - break; + if (lgr->role == SMC_CLNT) { + /* server requests to delete this link, 
send response */ + if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { + /* DEL LINK REQ during ADD LINK SEQ */ + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, + qentry); + wake_up_interruptible(&lgr->llc_waiter); + } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, + qentry)) { + schedule_work(&lgr->llc_del_link_work); + } + } else { + if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_ADD_LINK && + !lgr->llc_flow_lcl.qentry) { + /* DEL LINK REQ during ADD LINK SEQ */ + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, + qentry); + wake_up_interruptible(&lgr->llc_waiter); + } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, + qentry)) { + schedule_work(&lgr->llc_del_link_work); + } + } + return; case SMC_LLC_CONFIRM_RKEY: /* new request from remote, assign to remote flow */ if (smc_llc_flow_start(&lgr->llc_flow_rmt, qentry)) { @@ -1325,6 +1343,7 @@ static void smc_llc_rx_response(struct smc_link *link, complete(&link->llc_testlink_resp); break; case SMC_LLC_ADD_LINK: + case SMC_LLC_DELETE_LINK: case SMC_LLC_CONFIRM_LINK: case SMC_LLC_ADD_LINK_CONT: case SMC_LLC_CONFIRM_RKEY: @@ -1333,10 +1352,6 @@ static void smc_llc_rx_response(struct smc_link *link, smc_llc_flow_qentry_set(&link->lgr->llc_flow_lcl, qentry); wake_up_interruptible(&link->lgr->llc_waiter); return; - case SMC_LLC_DELETE_LINK: - if (link->lgr->role == SMC_SERV) - smc_lgr_schedule_free_work_fast(link->lgr); - break; case SMC_LLC_CONFIRM_RKEY_CONT: /* not used because max links is 3 */ break; @@ -1424,6 +1439,7 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) INIT_WORK(&lgr->llc_event_work, smc_llc_event_work); INIT_WORK(&lgr->llc_add_link_work, smc_llc_add_link_work); + INIT_WORK(&lgr->llc_del_link_work, smc_llc_delete_link_work); INIT_LIST_HEAD(&lgr->llc_event_q); spin_lock_init(&lgr->llc_event_q_lock); spin_lock_init(&lgr->llc_flow_lock); @@ -1439,6 +1455,7 @@ void smc_llc_lgr_clear(struct smc_link_group *lgr) wake_up_interruptible_all(&lgr->llc_waiter); cancel_work_sync(&lgr->llc_event_work); cancel_work_sync(&lgr->llc_add_link_work); + cancel_work_sync(&lgr->llc_del_link_work); if (lgr->delayed_event) { kfree(lgr->delayed_event); lgr->delayed_event = NULL; -- cgit v1.2.3 From 9c4168789cc635e1f0d265157b7617259d56bfee Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Sun, 3 May 2020 14:38:48 +0200 Subject: net/smc: delete link processing as SMC client Add smc_llc_process_cli_delete_link() to process a DELETE_LINK request as SMC client. When the request is to delete ALL links then terminate the whole link group. If not, find the link to delete by its link_id, send the DELETE_LINK response LLC message and then clear the deleted link. Finally determine and update the link group state. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_llc.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index e4e3910a9624..cd57b4fb1842 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -863,6 +863,18 @@ static void smc_llc_process_cli_add_link(struct smc_link_group *lgr) mutex_unlock(&lgr->llc_conf_mutex); } +static int smc_llc_active_link_count(struct smc_link_group *lgr) +{ + int i, link_count = 0; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_usable(&lgr->lnk[i])) + continue; + link_count++; + } + return link_count; +} + /* find the asymmetric link when 3 links are established */ static struct smc_link *smc_llc_find_asym_link(struct smc_link_group *lgr) { @@ -1118,6 +1130,63 @@ out: smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } +static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr) +{ + struct smc_link *lnk_del = NULL, *lnk_asym, *lnk; + struct smc_llc_msg_del_link *del_llc; + struct smc_llc_qentry *qentry; + int active_links; + int lnk_idx; + + qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); + lnk = qentry->link; + del_llc = &qentry->msg.delete_link; + + if (del_llc->hd.flags & SMC_LLC_FLAG_DEL_LINK_ALL) { + smc_lgr_terminate_sched(lgr); + goto out; + } + mutex_lock(&lgr->llc_conf_mutex); + /* delete single link */ + for (lnk_idx = 0; lnk_idx < SMC_LINKS_PER_LGR_MAX; lnk_idx++) { + if (lgr->lnk[lnk_idx].link_id != del_llc->link_num) + continue; + lnk_del = &lgr->lnk[lnk_idx]; + break; + } + del_llc->hd.flags |= SMC_LLC_FLAG_RESP; + if (!lnk_del) { + /* link was not found */ + del_llc->reason = htonl(SMC_LLC_DEL_NOLNK); + smc_llc_send_message(lnk, &qentry->msg); + goto out_unlock; + } + lnk_asym = smc_llc_find_asym_link(lgr); + + del_llc->reason = 0; + smc_llc_send_message(lnk, &qentry->msg); /* response */ + + if (smc_link_downing(&lnk_del->state)) { + /* tbd: call smc_switch_conns(lgr, lnk_del, false); */ + smc_wr_tx_wait_no_pending_sends(lnk_del); + } + smcr_link_clear(lnk_del); + + active_links = smc_llc_active_link_count(lgr); + if (lnk_del == lnk_asym) { + /* expected deletion of asym link, don't change lgr state */ + } else if (active_links == 1) { + lgr->type = SMC_LGR_SINGLE; + } else if (!active_links) { + lgr->type = SMC_LGR_NONE; + smc_lgr_terminate_sched(lgr); + } +out_unlock: + mutex_unlock(&lgr->llc_conf_mutex); +out: + kfree(qentry); +} + static void smc_llc_delete_link_work(struct work_struct *work) { struct smc_link_group *lgr = container_of(work, struct smc_link_group, @@ -1128,6 +1197,9 @@ static void smc_llc_delete_link_work(struct work_struct *work) smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); goto out; } + + if (lgr->role == SMC_CLNT) + smc_llc_process_cli_delete_link(lgr); out: smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } -- cgit v1.2.3 From 08ae27ddfb6514a8316b17256cd4262bb6931c1f Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Sun, 3 May 2020 14:38:49 +0200 Subject: net/smc: delete link processing as SMC server Add smc_llc_process_srv_delete_link() to process a DELETE_LINK request as SMC server. When the request is to delete ALL links then terminate the whole link group. If not, find the link to delete by its link_id, send the DELETE_LINK request LLC message and wait for the response. No matter if a response was received, clear the deleted link and update the link group state. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_llc.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index cd57b4fb1842..ac065f6d60dc 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -1187,6 +1187,76 @@ out: kfree(qentry); } +static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) +{ + struct smc_llc_msg_del_link *del_llc; + struct smc_link *lnk, *lnk_del; + struct smc_llc_qentry *qentry; + int active_links; + int i; + + mutex_lock(&lgr->llc_conf_mutex); + qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); + lnk = qentry->link; + del_llc = &qentry->msg.delete_link; + + if (qentry->msg.delete_link.hd.flags & SMC_LLC_FLAG_DEL_LINK_ALL) { + /* delete entire lgr */ + smc_lgr_terminate_sched(lgr); + goto out; + } + /* delete single link */ + lnk_del = NULL; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].link_id == del_llc->link_num) { + lnk_del = &lgr->lnk[i]; + break; + } + } + if (!lnk_del) + goto out; /* asymmetric link already deleted */ + + if (smc_link_downing(&lnk_del->state)) { + /* tbd: call smc_switch_conns(lgr, lnk_del, false); */ + smc_wr_tx_wait_no_pending_sends(lnk_del); + } + if (!list_empty(&lgr->list)) { + /* qentry is either a request from peer (send it back to + * initiate the DELETE_LINK processing), or a locally + * enqueued DELETE_LINK request (forward it) + */ + if (!smc_llc_send_message(lnk, &qentry->msg)) { + struct smc_llc_msg_del_link *del_llc_resp; + struct smc_llc_qentry *qentry2; + + qentry2 = smc_llc_wait(lgr, lnk, SMC_LLC_WAIT_TIME, + SMC_LLC_DELETE_LINK); + if (!qentry2) { + } else { + del_llc_resp = &qentry2->msg.delete_link; + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + } + } + } + smcr_link_clear(lnk_del); + + active_links = smc_llc_active_link_count(lgr); + if (active_links == 1) { + lgr->type = SMC_LGR_SINGLE; + } else if (!active_links) { + lgr->type = SMC_LGR_NONE; + smc_lgr_terminate_sched(lgr); + } + + if (lgr->type == SMC_LGR_SINGLE && !list_empty(&lgr->list)) { + /* trigger setup of asymm alt link */ + /* tbd: call smc_llc_srv_add_link_local(lnk); */ + } +out: + mutex_unlock(&lgr->llc_conf_mutex); + kfree(qentry); +} + static void smc_llc_delete_link_work(struct work_struct *work) { struct smc_link_group *lgr = container_of(work, struct smc_link_group, @@ -1200,6 +1270,8 @@ static void smc_llc_delete_link_work(struct work_struct *work) if (lgr->role == SMC_CLNT) smc_llc_process_cli_delete_link(lgr); + else + smc_llc_process_srv_delete_link(lgr); out: smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } -- cgit v1.2.3 From 4dadd151b26589fd0520feb97c93ee981b393a99 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Sun, 3 May 2020 14:38:50 +0200 Subject: net/smc: enqueue local LLC messages As SMC server, when a second link was deleted, trigger the setup of an asymmetric link. Do this by enqueueing a local ADD_LINK message which is processed by the LLC layer as if it were received from peer. Do the same when a new IB port became active and a new link could be created. smc_llc_srv_add_link_local() enqueues a local ADD_LINK message. And smc_llc_srv_delete_link_local() is used the same way to enqueue a local DELETE_LINK message. This is used when an IB port is no longer active. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_core.c | 3 ++- net/smc/smc_llc.c | 30 +++++++++++++++++++++++++++++- net/smc/smc_llc.h | 2 ++ 3 files changed, 33 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index a964304283fa..32a6cadc5c1f 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -883,7 +883,7 @@ static void smcr_link_up(struct smc_link_group *lgr, link = smc_llc_usable_link(lgr); if (!link) return; - /* tbd: call smc_llc_srv_add_link_local(link); */ + smc_llc_srv_add_link_local(link); } else { /* invite server to start add link processing */ u8 gid[SMC_GID_SIZE]; @@ -954,6 +954,7 @@ static void smcr_link_down(struct smc_link *lnk) if (lgr->role == SMC_SERV) { /* trigger local delete link processing */ + smc_llc_srv_delete_link_local(to_lnk, del_link_id); } else { if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { /* another llc task is ongoing */ diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index ac065f6d60dc..7675ccd6f3c3 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -159,6 +159,8 @@ struct smc_llc_qentry { union smc_llc_msg msg; }; +static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc); + struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow) { struct smc_llc_qentry *qentry = flow->qentry; @@ -1110,6 +1112,17 @@ static void smc_llc_process_srv_add_link(struct smc_link_group *lgr) mutex_unlock(&lgr->llc_conf_mutex); } +/* enqueue a local add_link req to trigger a new add_link flow, only as SERV */ +void smc_llc_srv_add_link_local(struct smc_link *link) +{ + struct smc_llc_msg_add_link add_llc = {0}; + + add_llc.hd.length = sizeof(add_llc); + add_llc.hd.common.type = SMC_LLC_ADD_LINK; + /* no dev and port needed, we as server ignore client data anyway */ + smc_llc_enqueue(link, (union smc_llc_msg *)&add_llc); +} + /* worker to process an add link message */ static void smc_llc_add_link_work(struct work_struct *work) { @@ -1130,6 +1143,21 @@ out: smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } +/* enqueue a local del_link msg to trigger a new del_link flow, + * called only for role SMC_SERV + */ +void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id) +{ + struct smc_llc_msg_del_link del_llc = {0}; + + del_llc.hd.length = sizeof(del_llc); + del_llc.hd.common.type = SMC_LLC_DELETE_LINK; + del_llc.link_num = del_link_id; + del_llc.reason = htonl(SMC_LLC_DEL_LOST_PATH); + del_llc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; + smc_llc_enqueue(link, (union smc_llc_msg *)&del_llc); +} + static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr) { struct smc_link *lnk_del = NULL, *lnk_asym, *lnk; @@ -1250,7 +1278,7 @@ static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) if (lgr->type == SMC_LGR_SINGLE && !list_empty(&lgr->list)) { /* trigger setup of asymm alt link */ - /* tbd: call smc_llc_srv_add_link_local(lnk); */ + smc_llc_srv_add_link_local(lnk); } out: mutex_unlock(&lgr->llc_conf_mutex); diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 1a7748d0541f..c335fc5f363c 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -69,6 +69,7 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, enum smc_llc_reqresp reqresp, bool orderly, u32 reason); +void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id); void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc); void smc_llc_lgr_clear(struct smc_link_group *lgr); int 
smc_llc_link_init(struct smc_link *link); @@ -90,6 +91,7 @@ struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow); void smc_llc_flow_qentry_del(struct smc_llc_flow *flow); int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry); int smc_llc_srv_add_link(struct smc_link *link); +void smc_llc_srv_add_link_local(struct smc_link *link); int smc_llc_init(void) __init; #endif /* SMC_LLC_H */ -- cgit v1.2.3 From f0ec4f1d32ad49a23b93156949208dd9348e3590 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Mon, 4 May 2020 14:18:37 +0200 Subject: net/smc: save state of last sent CDC message When a link goes down and all connections of this link need to be switched to another link, the producer cursor and the sequence number of the last successfully sent CDC message must be known. Add the two fields to the SMC connection and update them in the tx completion handler. To allow matching of sequences in error cases, reset the seqno to the old value in smc_cdc_msg_send() when the actual send failed. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc.h | 4 ++++ net/smc/smc_cdc.c | 6 ++++++ 2 files changed, 10 insertions(+) (limited to 'net') diff --git a/net/smc/smc.h b/net/smc/smc.h index 1a084afa7372..1e9113771600 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -143,6 +143,9 @@ struct smc_connection { * .prod cf. TCP snd_nxt * .cons cf. TCP sends ack */ + union smc_host_cursor local_tx_ctrl_fin; + /* prod crsr - confirmed by peer + */ union smc_host_cursor tx_curs_prep; /* tx - prepared data * snd_max..wmem_alloc */ @@ -154,6 +157,7 @@ struct smc_connection { */ atomic_t sndbuf_space; /* remaining space in sndbuf */ u16 tx_cdc_seq; /* sequence # for CDC send */ + u16 tx_cdc_seq_fin; /* sequence # - tx completed */ spinlock_t send_lock; /* protect wr_sends */ struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ u32 tx_off; /* base offset in peer rmb */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index f64589d823aa..c5e33296e55c 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -47,6 +47,9 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ smp_mb__after_atomic(); smc_curs_copy(&conn->tx_curs_fin, &cdcpend->cursor, conn); + smc_curs_copy(&conn->local_tx_ctrl_fin, &cdcpend->p_cursor, + conn); + conn->tx_cdc_seq_fin = cdcpend->ctrl_seq; } smc_tx_sndbuf_nonfull(smc); bh_unlock_sock(&smc->sk); @@ -104,6 +107,9 @@ int smc_cdc_msg_send(struct smc_connection *conn, if (!rc) { smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn); conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; + } else { + conn->tx_cdc_seq--; + conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; } return rc; -- cgit v1.2.3 From c6f02ebeea3a0ff4bddddf0fd82303190ebb3dd1 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Mon, 4 May 2020 14:18:38 +0200 Subject: net/smc: switch connections to alternate link Add smc_switch_conns() to switch all connections from a link that is going down. Find another link to switch the connections to, and switch each connection to the new link. smc_switch_cursor() updates the cursors of a connection to the state of the last successfully sent CDC message. When there is no link to switch to, terminate the link group. Call smc_switch_conns() when a link is going down. And since the link of a connection can now switch, adapt the CDC and TX functions to detect and handle link switches.
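Aside: the CDC and TX changes in the diff below share one pattern: snapshot conn->lnk, acquire a send slot on that snapshot, then re-check under conn->send_lock whether the connection was switched to another link in the meantime; if so, return the slot and retry once. A rough pthread-based user-space sketch of that pattern follows; get_slot() and put_slot() are hypothetical stand-ins for smc_cdc_get_free_slot() and smc_wr_tx_put_slot(), not kernel API.

#include <errno.h>
#include <pthread.h>

struct conn {
	pthread_mutex_t send_lock;
	void *lnk;		/* current link; may be switched concurrently */
};

/* hypothetical stand-ins for the slot acquire/release helpers */
static int get_slot(void *link) { (void)link; return 0; }
static void put_slot(void *link) { (void)link; }

static int send_on_current_link(struct conn *c)
{
	int again = 0;
	void *link;

retry:
	link = c->lnk;			/* snapshot the link outside the lock */
	if (get_slot(link))
		return -EBUSY;
	pthread_mutex_lock(&c->send_lock);
	if (link != c->lnk) {		/* connection switched links meanwhile */
		pthread_mutex_unlock(&c->send_lock);
		put_slot(link);		/* slot belongs to the old link */
		if (again)
			return -ENOLINK;	/* switched twice: give up */
		again = 1;
		goto retry;
	}
	/* ... build and post the message on 'link' while holding the lock ... */
	pthread_mutex_unlock(&c->send_lock);
	return 0;
}

int main(void)
{
	struct conn c = { PTHREAD_MUTEX_INITIALIZER, (void *)1 };

	return send_on_current_link(&c);
}

Retrying only once is a deliberate choice: a second switch while the first retry is in flight indicates the link group is unstable, and the caller is better served by an error than by looping.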
Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_cdc.c | 18 +++++++- net/smc/smc_cdc.h | 1 + net/smc/smc_core.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++++++++- net/smc/smc_core.h | 2 + net/smc/smc_llc.c | 6 +-- net/smc/smc_tx.c | 12 ++++- 6 files changed, 162 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index c5e33296e55c..3ca986066f32 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -56,11 +56,11 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, } int smc_cdc_get_free_slot(struct smc_connection *conn, + struct smc_link *link, struct smc_wr_buf **wr_buf, struct smc_rdma_wr **wr_rdma_buf, struct smc_cdc_tx_pend **pend) { - struct smc_link *link = conn->lnk; int rc; rc = smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf, @@ -119,13 +119,27 @@ static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn) { struct smc_cdc_tx_pend *pend; struct smc_wr_buf *wr_buf; + struct smc_link *link; + bool again = false; int rc; - rc = smc_cdc_get_free_slot(conn, &wr_buf, NULL, &pend); +again: + link = conn->lnk; + rc = smc_cdc_get_free_slot(conn, link, &wr_buf, NULL, &pend); if (rc) return rc; spin_lock_bh(&conn->send_lock); + if (link != conn->lnk) { + /* link of connection changed, try again one time*/ + spin_unlock_bh(&conn->send_lock); + smc_wr_tx_put_slot(link, + (struct smc_wr_tx_pend_priv *)pend); + if (again) + return -ENOLINK; + again = true; + goto again; + } rc = smc_cdc_msg_send(conn, wr_buf, pend); spin_unlock_bh(&conn->send_lock); return rc; diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 861dc24c588c..42246b4bdcc9 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -304,6 +304,7 @@ struct smc_cdc_tx_pend { }; int smc_cdc_get_free_slot(struct smc_connection *conn, + struct smc_link *link, struct smc_wr_buf **wr_buf, struct smc_rdma_wr **wr_rdma_buf, struct smc_cdc_tx_pend **pend); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 32a6cadc5c1f..21bc1ec07e99 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -432,6 +432,135 @@ out: return rc; } +static int smc_write_space(struct smc_connection *conn) +{ + int buffer_len = conn->peer_rmbe_size; + union smc_host_cursor prod; + union smc_host_cursor cons; + int space; + + smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn); + smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); + /* determine rx_buf space */ + space = buffer_len - smc_curs_diff(buffer_len, &cons, &prod); + return space; +} + +static int smc_switch_cursor(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + union smc_host_cursor cons, fin; + int rc = 0; + int diff; + + smc_curs_copy(&conn->tx_curs_sent, &conn->tx_curs_fin, conn); + smc_curs_copy(&fin, &conn->local_tx_ctrl_fin, conn); + /* set prod cursor to old state, enforce tx_rdma_writes() */ + smc_curs_copy(&conn->local_tx_ctrl.prod, &fin, conn); + smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); + + if (smc_curs_comp(conn->peer_rmbe_size, &cons, &fin) < 0) { + /* cons cursor advanced more than fin, and prod was set + * fin above, so now prod is smaller than cons. Fix that. 
+ */ + diff = smc_curs_diff(conn->peer_rmbe_size, &fin, &cons); + smc_curs_add(conn->sndbuf_desc->len, + &conn->tx_curs_sent, diff); + smc_curs_add(conn->sndbuf_desc->len, + &conn->tx_curs_fin, diff); + + smp_mb__before_atomic(); + atomic_add(diff, &conn->sndbuf_space); + smp_mb__after_atomic(); + + smc_curs_add(conn->peer_rmbe_size, + &conn->local_tx_ctrl.prod, diff); + smc_curs_add(conn->peer_rmbe_size, + &conn->local_tx_ctrl_fin, diff); + } + /* recalculate, value is used by tx_rdma_writes() */ + atomic_set(&smc->conn.peer_rmbe_space, smc_write_space(conn)); + + if (smc->sk.sk_state != SMC_INIT && + smc->sk.sk_state != SMC_CLOSED) { + /* tbd: call rc = smc_cdc_get_slot_and_msg_send(conn); */ + if (!rc) { + schedule_delayed_work(&conn->tx_work, 0); + smc->sk.sk_data_ready(&smc->sk); + } + } + return rc; +} + +struct smc_link *smc_switch_conns(struct smc_link_group *lgr, + struct smc_link *from_lnk, bool is_dev_err) +{ + struct smc_link *to_lnk = NULL; + struct smc_connection *conn; + struct smc_sock *smc; + struct rb_node *node; + int i, rc = 0; + + /* link is inactive, wake up tx waiters */ + smc_wr_wakeup_tx_wait(from_lnk); + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state != SMC_LNK_ACTIVE || + i == from_lnk->link_idx) + continue; + if (is_dev_err && from_lnk->smcibdev == lgr->lnk[i].smcibdev && + from_lnk->ibport == lgr->lnk[i].ibport) { + continue; + } + to_lnk = &lgr->lnk[i]; + break; + } + if (!to_lnk) { + smc_lgr_terminate_sched(lgr); + return NULL; + } +again: + read_lock_bh(&lgr->conns_lock); + for (node = rb_first(&lgr->conns_all); node; node = rb_next(node)) { + conn = rb_entry(node, struct smc_connection, alert_node); + if (conn->lnk != from_lnk) + continue; + smc = container_of(conn, struct smc_sock, conn); + /* conn->lnk not yet set in SMC_INIT state */ + if (smc->sk.sk_state == SMC_INIT) + continue; + if (smc->sk.sk_state == SMC_CLOSED || + smc->sk.sk_state == SMC_PEERCLOSEWAIT1 || + smc->sk.sk_state == SMC_PEERCLOSEWAIT2 || + smc->sk.sk_state == SMC_APPFINCLOSEWAIT || + smc->sk.sk_state == SMC_APPCLOSEWAIT1 || + smc->sk.sk_state == SMC_APPCLOSEWAIT2 || + smc->sk.sk_state == SMC_PEERFINCLOSEWAIT || + smc->sk.sk_state == SMC_PEERABORTWAIT || + smc->sk.sk_state == SMC_PROCESSABORT) { + spin_lock_bh(&conn->send_lock); + conn->lnk = to_lnk; + spin_unlock_bh(&conn->send_lock); + continue; + } + sock_hold(&smc->sk); + read_unlock_bh(&lgr->conns_lock); + /* avoid race with smcr_tx_sndbuf_nonempty() */ + spin_lock_bh(&conn->send_lock); + conn->lnk = to_lnk; + rc = smc_switch_cursor(smc); + spin_unlock_bh(&conn->send_lock); + sock_put(&smc->sk); + if (rc) { + smcr_link_down_cond_sched(to_lnk); + return NULL; + } + goto again; + } + read_unlock_bh(&lgr->conns_lock); + return to_lnk; +} + static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc, struct smc_link_group *lgr) { @@ -943,8 +1072,7 @@ static void smcr_link_down(struct smc_link *lnk) return; smc_ib_modify_qp_reset(lnk); - to_lnk = NULL; - /* tbd: call to_lnk = smc_switch_conns(lgr, lnk, true); */ + to_lnk = smc_switch_conns(lgr, lnk, true); if (!to_lnk) { /* no backup link available */ smcr_link_clear(lnk); return; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 7fe53feb9dc4..584f11230c4f 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -380,6 +380,8 @@ void smcr_link_clear(struct smc_link *lnk); int smcr_buf_map_lgr(struct smc_link *lnk); int smcr_buf_reg_lgr(struct smc_link *lnk); int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc); +struct smc_link 
*smc_switch_conns(struct smc_link_group *lgr, + struct smc_link *from_lnk, bool is_dev_err); void smcr_link_down_cond(struct smc_link *lnk); void smcr_link_down_cond_sched(struct smc_link *lnk); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 7675ccd6f3c3..8d2368accbad 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -933,7 +933,7 @@ static void smc_llc_delete_asym_link(struct smc_link_group *lgr) return; /* no asymmetric link */ if (!smc_link_downing(&lnk_asym->state)) return; - /* tbd: lnk_new = smc_switch_conns(lgr, lnk_asym, false); */ + lnk_new = smc_switch_conns(lgr, lnk_asym, false); smc_wr_tx_wait_no_pending_sends(lnk_asym); if (!lnk_new) goto out_free; @@ -1195,7 +1195,7 @@ static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr) smc_llc_send_message(lnk, &qentry->msg); /* response */ if (smc_link_downing(&lnk_del->state)) { - /* tbd: call smc_switch_conns(lgr, lnk_del, false); */ + smc_switch_conns(lgr, lnk_del, false); smc_wr_tx_wait_no_pending_sends(lnk_del); } smcr_link_clear(lnk_del); @@ -1245,7 +1245,7 @@ static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) goto out; /* asymmetric link already deleted */ if (smc_link_downing(&lnk_del->state)) { - /* tbd: call smc_switch_conns(lgr, lnk_del, false); */ + smc_switch_conns(lgr, lnk_del, false); smc_wr_tx_wait_no_pending_sends(lnk_del); } if (!list_empty(&lgr->list)) { diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 417204572a69..54ba0443847e 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -482,12 +482,13 @@ static int smc_tx_rdma_writes(struct smc_connection *conn, static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) { struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags; + struct smc_link *link = conn->lnk; struct smc_rdma_wr *wr_rdma_buf; struct smc_cdc_tx_pend *pend; struct smc_wr_buf *wr_buf; int rc; - rc = smc_cdc_get_free_slot(conn, &wr_buf, &wr_rdma_buf, &pend); + rc = smc_cdc_get_free_slot(conn, link, &wr_buf, &wr_rdma_buf, &pend); if (rc < 0) { if (rc == -EBUSY) { struct smc_sock *smc = @@ -505,10 +506,17 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) } spin_lock_bh(&conn->send_lock); + if (link != conn->lnk) { + /* link of connection changed, tx_work will restart */ + smc_wr_tx_put_slot(link, + (struct smc_wr_tx_pend_priv *)pend); + rc = -ENOLINK; + goto out_unlock; + } if (!pflags->urg_data_present) { rc = smc_tx_rdma_writes(conn, wr_rdma_buf); if (rc) { - smc_wr_tx_put_slot(conn->lnk, + smc_wr_tx_put_slot(link, (struct smc_wr_tx_pend_priv *)pend); goto out_unlock; } -- cgit v1.2.3 From 29bd73dba4f72970895a2459f7190d388f5204f7 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Mon, 4 May 2020 14:18:39 +0200 Subject: net/smc: send failover validation message When a connection is switched to a new link, a link validation message must be sent to the peer over the new link, containing the sequence number of the last CDC message that was sent over the old link. The peer validates that this sequence number is the same as or lower than the number it received, and aborts the connection if messages were lost. Add smcr_cdc_msg_send_validation() to send the validation message, and call it from smc_switch_cursor() when a connection has been switched. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller
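Aside: the validation described above leans on serial-number arithmetic. tx_cdc_seq is an unsigned 16-bit counter that wraps, so "sequence already seen" is decided by interpreting the difference as a signed 16-bit value, which is sound as long as far fewer than 0x8000 messages are ever in flight. A self-contained sketch of that comparison follows (the next patch performs the equivalent check in smc_cdc_msg_validate(); the function name here is illustrative).

#include <stdint.h>
#include <stdio.h>

/* wrapping u16 compare: recv_seq counts as already seen iff the difference,
 * read as a signed 16-bit value, is non-negative
 */
static int seq_already_seen(uint16_t local_seq, uint16_t recv_seq)
{
	int16_t diff = (int16_t)(local_seq - recv_seq);

	return diff >= 0;
}

int main(void)
{
	printf("%d\n", seq_already_seen(0x0002, 0xfffe)); /* 1: seen, wrapped */
	printf("%d\n", seq_already_seen(0xfffe, 0x0002)); /* 0: peer is ahead */
	return 0;
}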
--- net/smc/smc_cdc.c | 25 +++++++++++++++++++++++++ net/smc/smc_cdc.h | 1 + net/smc/smc_core.c | 2 +- 3 files changed, 27 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 3ca986066f32..e6b7eef71831 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -115,6 +115,31 @@ int smc_cdc_msg_send(struct smc_connection *conn, return rc; } +/* send a validation msg indicating the move of a conn to an other QP link */ +int smcr_cdc_msg_send_validation(struct smc_connection *conn) +{ + struct smc_host_cdc_msg *local = &conn->local_tx_ctrl; + struct smc_link *link = conn->lnk; + struct smc_cdc_tx_pend *pend; + struct smc_wr_buf *wr_buf; + struct smc_cdc_msg *peer; + int rc; + + rc = smc_cdc_get_free_slot(conn, link, &wr_buf, NULL, &pend); + if (rc) + return rc; + + peer = (struct smc_cdc_msg *)wr_buf; + peer->common.type = local->common.type; + peer->len = local->len; + peer->seqno = htons(conn->tx_cdc_seq_fin); /* seqno last compl. tx */ + peer->token = htonl(local->token); + peer->prod_flags.failover_validation = 1; + + rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); + return rc; +} + static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn) { struct smc_cdc_tx_pend *pend; diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 42246b4bdcc9..9cfabc9af120 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -313,6 +313,7 @@ int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, struct smc_cdc_tx_pend *pend); int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn); int smcd_cdc_msg_send(struct smc_connection *conn); +int smcr_cdc_msg_send_validation(struct smc_connection *conn); int smc_cdc_init(void) __init; void smcd_cdc_rx_init(struct smc_connection *conn); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 21bc1ec07e99..a558ce0bde97 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -483,7 +483,7 @@ static int smc_switch_cursor(struct smc_sock *smc) if (smc->sk.sk_state != SMC_INIT && smc->sk.sk_state != SMC_CLOSED) { - /* tbd: call rc = smc_cdc_get_slot_and_msg_send(conn); */ + rc = smcr_cdc_msg_send_validation(conn); if (!rc) { schedule_delayed_work(&conn->tx_work, 0); smc->sk.sk_data_ready(&smc->sk); -- cgit v1.2.3 From b286a0651e4404ab96cdfdcdad8a839a26b3751e Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Mon, 4 May 2020 14:18:40 +0200 Subject: net/smc: handle incoming CDC validation message Call smc_cdc_msg_validate() when a CDC message with the failover validation bit set is received. Validate that the sequence number sent with the message is one we have already received. If not, messages were lost and the connection is terminated using a new abort_work. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S.
Miller --- net/smc/smc.h | 2 ++ net/smc/smc_cdc.c | 37 +++++++++++++++++++++++++++++++------ net/smc/smc_core.c | 15 +++++++++++++++ 3 files changed, 48 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/smc/smc.h b/net/smc/smc.h index 1e9113771600..6f1c42da7a4c 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -188,12 +188,14 @@ struct smc_connection { spinlock_t acurs_lock; /* protect cursors */ #endif struct work_struct close_work; /* peer sent some closing */ + struct work_struct abort_work; /* abort the connection */ struct tasklet_struct rx_tsklet; /* Receiver tasklet for SMC-D */ u8 rx_off; /* receive offset: * 0 for SMC-R, 32 for SMC-D */ u64 peer_token; /* SMC-D token of peer */ u8 killed : 1; /* abnormal termination */ + u8 out_of_sync : 1; /* out of sync with peer */ }; struct smc_sock { /* smc sock container */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index e6b7eef71831..b2b85e1be72c 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -282,6 +282,28 @@ static void smc_cdc_handle_urg_data_arrival(struct smc_sock *smc, sk_send_sigurg(&smc->sk); } +static void smc_cdc_msg_validate(struct smc_sock *smc, struct smc_cdc_msg *cdc, + struct smc_link *link) +{ + struct smc_connection *conn = &smc->conn; + u16 recv_seq = ntohs(cdc->seqno); + s16 diff; + + /* check that seqnum was seen before */ + diff = conn->local_rx_ctrl.seqno - recv_seq; + if (diff < 0) { /* diff larger than 0x7fff */ + /* drop connection */ + conn->out_of_sync = 1; /* prevent any further receives */ + spin_lock_bh(&conn->send_lock); + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + conn->lnk = link; + spin_unlock_bh(&conn->send_lock); + sock_hold(&smc->sk); /* sock_put in abort_work */ + if (!schedule_work(&conn->abort_work)) + sock_put(&smc->sk); + } +} + static void smc_cdc_msg_recv_action(struct smc_sock *smc, struct smc_cdc_msg *cdc) { @@ -412,16 +434,19 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf) read_lock_bh(&lgr->conns_lock); conn = smc_lgr_find_conn(ntohl(cdc->token), lgr); read_unlock_bh(&lgr->conns_lock); - if (!conn) + if (!conn || conn->out_of_sync) return; smc = container_of(conn, struct smc_sock, conn); - if (!cdc->prod_flags.failover_validation) { - if (smc_cdc_before(ntohs(cdc->seqno), - conn->local_rx_ctrl.seqno)) - /* received seqno is old */ - return; + if (cdc->prod_flags.failover_validation) { + smc_cdc_msg_validate(smc, cdc, link); + return; } + if (smc_cdc_before(ntohs(cdc->seqno), + conn->local_rx_ctrl.seqno)) + /* received seqno is old */ + return; + smc_cdc_msg_recv(smc, cdc); } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index a558ce0bde97..b5633fa19b6d 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -615,6 +615,8 @@ void smc_conn_free(struct smc_connection *conn) tasklet_kill(&conn->rx_tsklet); } else { smc_cdc_tx_dismiss_slots(conn); + if (current_work() != &conn->abort_work) + cancel_work_sync(&conn->abort_work); } if (!list_empty(&lgr->list)) { smc_lgr_unregister_conn(conn); @@ -996,6 +998,18 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) } } +/* abort connection, abort_work scheduled from tasklet context */ +static void smc_conn_abort_work(struct work_struct *work) +{ + struct smc_connection *conn = container_of(work, + struct smc_connection, + abort_work); + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + + smc_conn_kill(conn, true); + sock_put(&smc->sk); /* sock_hold done by schedulers of abort_work */ +} + /* link is up - establish alternate link if applicable */ 
static void smcr_link_up(struct smc_link_group *lgr, struct smc_ib_device *smcibdev, u8 ibport) @@ -1302,6 +1316,7 @@ create: conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; + INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work); if (ini->is_smcd) { conn->rx_off = sizeof(struct smcd_cdc_msg); smcd_cdc_rx_init(conn); /* init tasklet for this conn */ -- cgit v1.2.3 From 09c61d24f96dfef7791debfcaf96efe067ab2ba8 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Mon, 4 May 2020 14:18:41 +0200 Subject: net/smc: wait for departure of an IB message Introduce smc_wr_tx_send_wait() to send an IB message and wait for the tx completion event of the message. This makes sure that the message is no longer in-flight when the function returns. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.h | 1 + net/smc/smc_wr.c | 39 +++++++++++++++++++++++++++++++++++++++ net/smc/smc_wr.h | 2 ++ 3 files changed, 42 insertions(+) (limited to 'net') diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 584f11230c4f..86eebbadc8f6 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -85,6 +85,7 @@ struct smc_link { struct smc_rdma_sges *wr_tx_rdma_sges;/*RDMA WRITE gather meta data*/ struct smc_rdma_wr *wr_tx_rdmas; /* WR RDMA WRITE */ struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */ + struct completion *wr_tx_compl; /* WR send CQE completion */ /* above four vectors have wr_tx_cnt elements and use the same index */ dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */ atomic_long_t wr_tx_id; /* seq # of last sent WR */ diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 3fd27bea4f7a..7239ba9b99dc 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -44,6 +44,7 @@ struct smc_wr_tx_pend { /* control data for a pending send request */ struct smc_link *link; u32 idx; struct smc_wr_tx_pend_priv priv; + u8 compl_requested; }; /******************************** send queue *********************************/ @@ -103,6 +104,8 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) if (pnd_snd_idx == link->wr_tx_cnt) return; link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status; + if (link->wr_tx_pends[pnd_snd_idx].compl_requested) + complete(&link->wr_tx_compl[pnd_snd_idx]); memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], sizeof(pnd_snd)); /* clear the full struct smc_wr_tx_pend including .priv */ memset(&link->wr_tx_pends[pnd_snd_idx], 0, @@ -275,6 +278,33 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) return rc; } +/* Send prepared WR slot via ib_post_send and wait for send completion + * notification. + * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer + */ +int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, + unsigned long timeout) +{ + struct smc_wr_tx_pend *pend; + int rc; + + pend = container_of(priv, struct smc_wr_tx_pend, priv); + pend->compl_requested = 1; + init_completion(&link->wr_tx_compl[pend->idx]); + + rc = smc_wr_tx_send(link, priv); + if (rc) + return rc; + /* wait for completion by smc_wr_tx_process_cqe() */ + rc = wait_for_completion_interruptible_timeout( + &link->wr_tx_compl[pend->idx], timeout); + if (rc <= 0) + rc = -ENODATA; + if (rc > 0) + rc = 0; + return rc; +} + /* Register a memory region and wait for result. 
*/ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) { @@ -555,6 +585,8 @@ void smc_wr_free_link(struct smc_link *lnk) void smc_wr_free_link_mem(struct smc_link *lnk) { + kfree(lnk->wr_tx_compl); + lnk->wr_tx_compl = NULL; kfree(lnk->wr_tx_pends); lnk->wr_tx_pends = NULL; kfree(lnk->wr_tx_mask); @@ -625,8 +657,15 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_tx_pends) goto no_mem_wr_tx_mask; + link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT, + sizeof(link->wr_tx_compl[0]), + GFP_KERNEL); + if (!link->wr_tx_compl) + goto no_mem_wr_tx_pends; return 0; +no_mem_wr_tx_pends: + kfree(link->wr_tx_pends); no_mem_wr_tx_mask: kfree(link->wr_tx_mask); no_mem_wr_rx_sges: diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index f7eaeb3391f3..423b8709f1c9 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -101,6 +101,8 @@ int smc_wr_tx_put_slot(struct smc_link *link, struct smc_wr_tx_pend_priv *wr_pend_priv); int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *wr_pend_priv); +int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, + unsigned long timeout); void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context); void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type, smc_wr_tx_filter filter, -- cgit v1.2.3 From f3811fd7bc97587b142fed9edf8c726694220cb2 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Mon, 4 May 2020 14:18:42 +0200 Subject: net/smc: send DELETE_LINK, ALL message and wait for send to complete Add smc_llc_send_message_wait() which uses smc_wr_tx_send_wait() to send an LLC message and waits for the message send to complete. smc_llc_send_link_delete_all() calls the new function to send an DELETE_LINK,ALL LLC message. The RFC states that the sender of this type of message needs to wait for the completion event of the message transmission and can terminate the link afterwards. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 5 +++++ net/smc/smc_llc.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ net/smc/smc_llc.h | 2 ++ 3 files changed, 51 insertions(+) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index b5633fa19b6d..8f630b76c5a4 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -238,6 +238,9 @@ static void smc_lgr_free_work(struct work_struct *work) spin_unlock_bh(lgr_lock); cancel_delayed_work(&lgr->free_work); + if (!lgr->is_smcd && !lgr->terminating) + smc_llc_send_link_delete_all(lgr, true, + SMC_LLC_DEL_PROG_INIT_TERM); if (lgr->is_smcd && !lgr->terminating) smc_ism_signal_shutdown(lgr); if (!lgr->is_smcd) { @@ -847,6 +850,8 @@ static void smc_lgr_cleanup(struct smc_link_group *lgr) smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); put_device(&lgr->smcd->dev); } else { + smc_llc_send_link_delete_all(lgr, false, + SMC_LLC_DEL_OP_INIT_TERM); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 8d2368accbad..0ea7ad6188ae 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -560,6 +560,25 @@ static int smc_llc_send_message(struct smc_link *link, void *llcbuf) return smc_wr_tx_send(link, pend); } +/* schedule an llc send on link, may wait for buffers, + * and wait for send completion notification. 
+ * @return 0 on success + */ +static int smc_llc_send_message_wait(struct smc_link *link, void *llcbuf) +{ + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + + if (!smc_link_usable(link)) + return -ENOLINK; + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + return rc; + memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg)); + return smc_wr_tx_send_wait(link, pend, SMC_LLC_WAIT_TIME); +} + /********************************* receive ***********************************/ static int smc_llc_alloc_alt_link(struct smc_link_group *lgr, @@ -1215,6 +1234,29 @@ out: kfree(qentry); } +/* try to send a DELETE LINK ALL request on any active link, + * waiting for send completion + */ +void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, u32 rsn) +{ + struct smc_llc_msg_del_link delllc = {0}; + int i; + + delllc.hd.common.type = SMC_LLC_DELETE_LINK; + delllc.hd.length = sizeof(delllc); + if (ord) + delllc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; + delllc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL; + delllc.reason = htonl(rsn); + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_usable(&lgr->lnk[i])) + continue; + if (!smc_llc_send_message_wait(&lgr->lnk[i], &delllc)) + break; + } +} + static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) { struct smc_llc_msg_del_link *del_llc; @@ -1230,6 +1272,8 @@ static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) if (qentry->msg.delete_link.hd.flags & SMC_LLC_FLAG_DEL_LINK_ALL) { /* delete entire lgr */ + smc_llc_send_link_delete_all(lgr, true, ntohl( + qentry->msg.delete_link.reason)); smc_lgr_terminate_sched(lgr); goto out; } diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index c335fc5f363c..6d2a5d943b83 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -89,6 +89,8 @@ struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, int time_out, u8 exp_msg); struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow); void smc_llc_flow_qentry_del(struct smc_llc_flow *flow); +void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, + u32 rsn); int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry); int smc_llc_srv_add_link(struct smc_link *link); void smc_llc_srv_add_link_local(struct smc_link *link); -- cgit v1.2.3 From 56bc3b2094b428d808dd1704fdb3086c66bcb310 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Mon, 4 May 2020 14:18:43 +0200 Subject: net/smc: assign link to a new connection For new connections, assign a link from the link group, using some simple load balancing. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 65 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 8f630b76c5a4..9c19b9aa3719 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -121,16 +121,59 @@ static void smc_lgr_add_alert_token(struct smc_connection *conn) rb_insert_color(&conn->alert_node, &conn->lgr->conns_all); } +/* assign an SMC-R link to the connection */ +static int smcr_lgr_conn_assign_link(struct smc_connection *conn, bool first) +{ + enum smc_link_state expected = first ? 
SMC_LNK_ACTIVATING : + SMC_LNK_ACTIVE; + int i, j; + + /* do link balancing */ + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &conn->lgr->lnk[i]; + + if (lnk->state != expected) + continue; + if (conn->lgr->role == SMC_CLNT) { + conn->lnk = lnk; /* temporary, SMC server assigns link*/ + break; + } + if (conn->lgr->conns_num % 2) { + for (j = i + 1; j < SMC_LINKS_PER_LGR_MAX; j++) { + struct smc_link *lnk2; + + lnk2 = &conn->lgr->lnk[j]; + if (lnk2->state == expected) { + conn->lnk = lnk2; + break; + } + } + } + if (!conn->lnk) + conn->lnk = lnk; + break; + } + if (!conn->lnk) + return SMC_CLC_DECL_NOACTLINK; + return 0; +} + /* Register connection in link group by assigning an alert token * registered in a search tree. * Requires @conns_lock * Note that '0' is a reserved value and not assigned. */ -static int smc_lgr_register_conn(struct smc_connection *conn) +static int smc_lgr_register_conn(struct smc_connection *conn, bool first) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); static atomic_t nexttoken = ATOMIC_INIT(0); + int rc; + if (!conn->lgr->is_smcd) { + rc = smcr_lgr_conn_assign_link(conn, first); + if (rc) + return rc; + } /* find a new alert_token_local value not yet used by some connection * in this link group */ @@ -141,22 +184,6 @@ static int smc_lgr_register_conn(struct smc_connection *conn) conn->alert_token_local = 0; } smc_lgr_add_alert_token(conn); - - /* assign the new connection to a link */ - if (!conn->lgr->is_smcd) { - struct smc_link *lnk; - int i; - - /* tbd - link balancing */ - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - lnk = &conn->lgr->lnk[i]; - if (lnk->state == SMC_LNK_ACTIVATING || - lnk->state == SMC_LNK_ACTIVE) - conn->lnk = lnk; - } - if (!conn->lnk) - return SMC_CLC_DECL_NOACTLINK; - } conn->lgr->conns_num++; return 0; } @@ -1285,7 +1312,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) /* link group found */ ini->cln_first_contact = SMC_REUSE_CONTACT; conn->lgr = lgr; - rc = smc_lgr_register_conn(conn); /* add conn to lgr */ + rc = smc_lgr_register_conn(conn, false); write_unlock_bh(&lgr->conns_lock); if (!rc && delayed_work_pending(&lgr->free_work)) cancel_delayed_work(&lgr->free_work); @@ -1313,7 +1340,7 @@ create: goto out; lgr = conn->lgr; write_lock_bh(&lgr->conns_lock); - rc = smc_lgr_register_conn(conn); /* add smc conn to lgr */ + rc = smc_lgr_register_conn(conn, true); write_unlock_bh(&lgr->conns_lock); if (rc) goto out; -- cgit v1.2.3 From ad6c111b8ae760114df6765d5a5ed1b09020d45d Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Mon, 4 May 2020 14:18:44 +0200 Subject: net/smc: asymmetric link tagging New connections must not be assigned to asymmetric links. Add asymmetric link tagging using new link variable link_is_asym. The new helpers smcr_lgr_set_type() and smcr_lgr_set_type_asym() are called to set the state of the link group, and tag all links accordingly. smcr_lgr_conn_assign_link() respects the link tagging and will not assign new connections to links tagged as asymmetric link. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_core.c | 26 +++++++++++++++++++++++--- net/smc/smc_core.h | 4 ++++ net/smc/smc_llc.c | 20 ++++++++++++++------ 3 files changed, 41 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 9c19b9aa3719..be15b30a1234 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -132,7 +132,7 @@ static int smcr_lgr_conn_assign_link(struct smc_connection *conn, bool first) for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &conn->lgr->lnk[i]; - if (lnk->state != expected) + if (lnk->state != expected || lnk->link_is_asym) continue; if (conn->lgr->role == SMC_CLNT) { conn->lnk = lnk; /* temporary, SMC server assigns link*/ @@ -143,7 +143,8 @@ static int smcr_lgr_conn_assign_link(struct smc_connection *conn, bool first) struct smc_link *lnk2; lnk2 = &conn->lgr->lnk[j]; - if (lnk2->state == expected) { + if (lnk2->state == expected && + !lnk2->link_is_asym) { conn->lnk = lnk2; break; } @@ -1030,6 +1031,25 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) } } +/* set new lgr type and clear all asymmetric link tagging */ +void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type) +{ + int i; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + if (smc_link_usable(&lgr->lnk[i])) + lgr->lnk[i].link_is_asym = false; + lgr->type = new_type; +} + +/* set new lgr type and tag a link as asymmetric */ +void smcr_lgr_set_type_asym(struct smc_link_group *lgr, + enum smc_lgr_type new_type, int asym_lnk_idx) +{ + smcr_lgr_set_type(lgr, new_type); + lgr->lnk[asym_lnk_idx].link_is_asym = true; +} + /* abort connection, abort_work scheduled from tasklet context */ static void smc_conn_abort_work(struct work_struct *work) { @@ -1123,7 +1143,7 @@ static void smcr_link_down(struct smc_link *lnk) smcr_link_clear(lnk); return; } - lgr->type = SMC_LGR_SINGLE; + smcr_lgr_set_type(lgr, SMC_LGR_SINGLE); del_link_id = lnk->link_id; if (lgr->role == SMC_SERV) { diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 86eebbadc8f6..6ed7ab6d89d5 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -117,6 +117,7 @@ struct smc_link { u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/ u8 link_id; /* unique # within link group */ u8 link_idx; /* index in lgr link array */ + u8 link_is_asym; /* is link asymmetric? 
*/ struct smc_link_group *lgr; /* parent link group */ struct work_struct link_down_wrk; /* wrk to bring link down */ @@ -380,6 +381,9 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, void smcr_link_clear(struct smc_link *lnk); int smcr_buf_map_lgr(struct smc_link *lnk); int smcr_buf_reg_lgr(struct smc_link *lnk); +void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type); +void smcr_lgr_set_type_asym(struct smc_link_group *lgr, + enum smc_lgr_type new_type, int asym_lnk_idx); int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc); struct smc_link *smc_switch_conns(struct smc_link_group *lgr, struct smc_link *from_lnk, bool is_dev_err); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 0ea7ad6188ae..f65b2aac6b52 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -796,7 +796,11 @@ static int smc_llc_cli_conf_link(struct smc_link *link, return -ENOLINK; } smc_llc_link_active(link_new); - lgr->type = lgr_new_t; + if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER) + smcr_lgr_set_type_asym(lgr, lgr_new_t, link_new->link_idx); + else + smcr_lgr_set_type(lgr, lgr_new_t); return 0; } @@ -1038,7 +1042,11 @@ static int smc_llc_srv_conf_link(struct smc_link *link, return -ENOLINK; } smc_llc_link_active(link_new); - lgr->type = lgr_new_t; + if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER) + smcr_lgr_set_type_asym(lgr, lgr_new_t, link_new->link_idx); + else + smcr_lgr_set_type(lgr, lgr_new_t); smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); return 0; } @@ -1223,9 +1231,9 @@ static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr) if (lnk_del == lnk_asym) { /* expected deletion of asym link, don't change lgr state */ } else if (active_links == 1) { - lgr->type = SMC_LGR_SINGLE; + smcr_lgr_set_type(lgr, SMC_LGR_SINGLE); } else if (!active_links) { - lgr->type = SMC_LGR_NONE; + smcr_lgr_set_type(lgr, SMC_LGR_NONE); smc_lgr_terminate_sched(lgr); } out_unlock: @@ -1314,9 +1322,9 @@ static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) active_links = smc_llc_active_link_count(lgr); if (active_links == 1) { - lgr->type = SMC_LGR_SINGLE; + smcr_lgr_set_type(lgr, SMC_LGR_SINGLE); } else if (!active_links) { - lgr->type = SMC_LGR_NONE; + smcr_lgr_set_type(lgr, SMC_LGR_NONE); smc_lgr_terminate_sched(lgr); } -- cgit v1.2.3 From 3e0c40afce4ea5b08bb7e3f65c55157817116640 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Mon, 4 May 2020 14:18:45 +0200 Subject: net/smc: add termination reason and handle LLC protocol violation Allow to set the reason code for the link group termination, and set meaningful values before termination processing is triggered. This reason code is sent to the peer in the final delete link message. When the LLC request or response layer receives a message type that was not handled, drop a warning and terminate the link group. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_core.c | 8 ++++++-- net/smc/smc_core.h | 2 ++ net/smc/smc_llc.c | 14 ++++++++++++++ net/smc/smc_llc.h | 8 ++++++++ 4 files changed, 30 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index be15b30a1234..b6f93b44f9c7 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -878,8 +878,11 @@ static void smc_lgr_cleanup(struct smc_link_group *lgr) smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); put_device(&lgr->smcd->dev); } else { - smc_llc_send_link_delete_all(lgr, false, - SMC_LLC_DEL_OP_INIT_TERM); + u32 rsn = lgr->llc_termination_rsn; + + if (!rsn) + rsn = SMC_LLC_DEL_PROG_INIT_TERM; + smc_llc_send_link_delete_all(lgr, false, rsn); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; @@ -1018,6 +1021,7 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) { list_del_init(&lgr->list); + smc_llc_set_termination_rsn(lgr, SMC_LLC_DEL_OP_INIT_TERM); __smc_lgr_terminate(lgr, false); } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 6ed7ab6d89d5..32bc45af9a1a 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -271,6 +271,8 @@ struct smc_link_group { /* protects llc flow */ int llc_testlink_time; /* link keep alive time */ + u32 llc_termination_rsn; + /* rsn code for termination */ }; struct { /* SMC-D */ u64 peer_gid; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index f65b2aac6b52..482acf80e26e 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -1420,6 +1420,14 @@ static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr) smc_llc_flow_qentry_del(&lgr->llc_flow_rmt); } +static void smc_llc_protocol_violation(struct smc_link_group *lgr, u8 type) +{ + pr_warn_ratelimited("smc: SMC-R lg %*phN LLC protocol violation: " + "llc_type %d\n", SMC_LGR_ID_SIZE, &lgr->id, type); + smc_llc_set_termination_rsn(lgr, SMC_LLC_DEL_PROT_VIOL); + smc_lgr_terminate_sched(lgr); +} + /* flush the llc event queue */ static void smc_llc_event_flush(struct smc_link_group *lgr) { @@ -1520,6 +1528,9 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt); } return; + default: + smc_llc_protocol_violation(lgr, llc->raw.hdr.common.type); + break; } out: kfree(qentry); @@ -1579,6 +1590,9 @@ static void smc_llc_rx_response(struct smc_link *link, case SMC_LLC_CONFIRM_RKEY_CONT: /* not used because max links is 3 */ break; + default: + smc_llc_protocol_violation(link->lgr, llc_type); + break; } kfree(qentry); } diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 6d2a5d943b83..f5882ebf357b 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -60,6 +60,14 @@ static inline struct smc_link *smc_llc_usable_link(struct smc_link_group *lgr) return NULL; } +/* set the termination reason code for the link group */ +static inline void smc_llc_set_termination_rsn(struct smc_link_group *lgr, + u32 rsn) +{ + if (!lgr->llc_termination_rsn) + lgr->llc_termination_rsn = rsn; +} + /* transmit */ int smc_llc_send_confirm_link(struct smc_link *lnk, enum smc_llc_reqresp reqresp); -- cgit v1.2.3 From a52bcc919b14c9d78f03b2b4ff604e5ca69c7e6d Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Mon, 4 May 2020 14:18:46 +0200 Subject: net/smc: improve termination processing Add helper smcr_lgr_link_deactivate_all() and eliminate duplicate code. In smc_lgr_free(), clear the smc-r links before smc_lgr_free_bufs() is called so that the buffers are already prepared for freeing.
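In outline, the new ordering reads (a condensed sketch of the smc_lgr_free() hunk shown below, not the complete function):

	static void smc_lgr_free(struct smc_link_group *lgr)
	{
		int i;

		if (!lgr->is_smcd) {
			mutex_lock(&lgr->llc_conf_mutex);
			for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++)
				if (lgr->lnk[i].state != SMC_LNK_UNUSED)
					smcr_link_clear(&lgr->lnk[i]); /* links cleared first */
			mutex_unlock(&lgr->llc_conf_mutex);
			smc_llc_lgr_clear(lgr);
		}
		smc_lgr_free_bufs(lgr); /* buffers freed only after the links are gone */
	}
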
The soft parameter in __smc_lgr_terminate() is no longer needed; smc_lgr_free() can be called directly. smc_lgr_terminate_sched() and smc_smcd_terminate() set lgr->freeing to indicate that the link group will be freed soon, avoiding unnecessary schedules of the free worker. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 61 +++++++++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index b6f93b44f9c7..fb391bc6781e 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -237,6 +237,19 @@ void smc_lgr_cleanup_early(struct smc_connection *conn) smc_lgr_schedule_free_work_fast(lgr); } +static void smcr_lgr_link_deactivate_all(struct smc_link_group *lgr) +{ + int i; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; + + if (smc_link_usable(lnk)) + lnk->state = SMC_LNK_INACTIVE; + } + wake_up_interruptible_all(&lgr->llc_waiter); +} + static void smc_lgr_free(struct smc_link_group *lgr); static void smc_lgr_free_work(struct work_struct *work) @@ -246,7 +259,6 @@ static void smc_lgr_free_work(struct work_struct *work) free_work); spinlock_t *lgr_lock; bool conns; - int i; smc_lgr_list_head(lgr, &lgr_lock); spin_lock_bh(lgr_lock); @@ -271,15 +283,8 @@ static void smc_lgr_free_work(struct work_struct *work) SMC_LLC_DEL_PROG_INIT_TERM); if (lgr->is_smcd && !lgr->terminating) smc_ism_signal_shutdown(lgr); - if (!lgr->is_smcd) { - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - struct smc_link *lnk = &lgr->lnk[i]; - - if (smc_link_usable(lnk)) - lnk->state = SMC_LNK_INACTIVE; - } - wake_up_interruptible_all(&lgr->llc_waiter); - } + if (!lgr->is_smcd) + smcr_lgr_link_deactivate_all(lgr); smc_lgr_free(lgr); } @@ -802,6 +807,16 @@ static void smc_lgr_free(struct smc_link_group *lgr) { int i; + if (!lgr->is_smcd) { + mutex_lock(&lgr->llc_conf_mutex); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state != SMC_LNK_UNUSED) + smcr_link_clear(&lgr->lnk[i]); + } + mutex_unlock(&lgr->llc_conf_mutex); + smc_llc_lgr_clear(lgr); + } + smc_lgr_free_bufs(lgr); if (lgr->is_smcd) { if (!lgr->terminating) { @@ -811,11 +826,6 @@ static void smc_lgr_free(struct smc_link_group *lgr) if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) wake_up(&lgr->smcd->lgrs_deleted); } else { - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (lgr->lnk[i].state != SMC_LNK_UNUSED) - smcr_link_clear(&lgr->lnk[i]); - } - smc_llc_lgr_clear(lgr); if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); } @@ -870,8 +880,6 @@ static void smc_conn_kill(struct smc_connection *conn, bool soft) static void smc_lgr_cleanup(struct smc_link_group *lgr) { - int i; - if (lgr->is_smcd) { smc_ism_signal_shutdown(lgr); smcd_unregister_all_dmbs(lgr); @@ -883,13 +891,7 @@ static void smc_lgr_cleanup(struct smc_link_group *lgr) if (!rsn) rsn = SMC_LLC_DEL_PROG_INIT_TERM; smc_llc_send_link_delete_all(lgr, false, rsn); - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - struct smc_link *lnk = &lgr->lnk[i]; - - if (smc_link_usable(lnk)) - lnk->state = SMC_LNK_INACTIVE; - } - wake_up_interruptible_all(&lgr->llc_waiter); + smcr_lgr_link_deactivate_all(lgr); } } @@ -905,8 +907,8 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) if (lgr->terminating) return; /* lgr already terminating */ - if (!soft) - cancel_delayed_work_sync(&lgr->free_work); + /* cancel free_work sync, will terminate when
lgr->freeing is set */ + cancel_delayed_work_sync(&lgr->free_work); lgr->terminating = 1; /* kill remaining link group connections */ @@ -926,10 +928,7 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) } read_unlock_bh(&lgr->conns_lock); smc_lgr_cleanup(lgr); - if (soft) - smc_lgr_schedule_free_work_fast(lgr); - else - smc_lgr_free(lgr); + smc_lgr_free(lgr); } /* unlink link group and schedule termination */ @@ -944,6 +943,7 @@ void smc_lgr_terminate_sched(struct smc_link_group *lgr) return; /* lgr already terminating */ } list_del_init(&lgr->list); + lgr->freeing = 1; spin_unlock_bh(lgr_lock); schedule_work(&lgr->terminate_work); } @@ -962,6 +962,7 @@ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan) if (peer_gid) /* peer triggered termination */ lgr->peer_shutdown = 1; list_move(&lgr->list, &lgr_free_list); + lgr->freeing = 1; } } spin_unlock_bh(&dev->lgr_lock); -- cgit v1.2.3 From 45fa8da0bf5cb447fcf835d184e2d3b745376e69 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Mon, 4 May 2020 14:18:47 +0200 Subject: net/smc: create improved SMC-R link_uid The link_uid of an SMC-R link is exchanged between SMC peers and its value can be used for debugging purposes. Create a unique link_uid during link initialization and use it in communication with SMC-R peers. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 1 + net/smc/smc_core.h | 4 +++- net/smc/smc_llc.c | 18 ++++++++++++++---- net/smc/smc_llc.h | 1 + 4 files changed, 19 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index fb391bc6781e..fb5f685ff494 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -331,6 +331,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->smcibdev = ini->ib_dev; lnk->ibport = ini->ib_port; lnk->path_mtu = ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; + smc_llc_link_set_uid(lnk); INIT_WORK(&lnk->link_down_wrk, smc_link_down_work); if (!ini->ib_dev->initialized) { rc = (int)smc_ib_setup_per_ibdev(ini->ib_dev); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 32bc45af9a1a..e2ace20db7fd 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -70,6 +70,8 @@ struct smc_rdma_wr { /* work requests per message struct ib_rdma_wr wr_tx_rdma[SMC_MAX_RDMA_WRITES]; }; +#define SMC_LGR_ID_SIZE 4 + struct smc_link { struct smc_ib_device *smcibdev; /* ib-device */ u8 ibport; /* port - values 1 | 2 */ @@ -116,6 +118,7 @@ struct smc_link { u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */ u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/ u8 link_id; /* unique # within link group */ + u8 link_uid[SMC_LGR_ID_SIZE]; /* unique lnk id */ u8 link_idx; /* index in lgr link array */ u8 link_is_asym; /* is link asymmetric? 
*/ struct smc_link_group *lgr; /* parent link group */ @@ -178,7 +181,6 @@ struct smc_rtoken { /* address/key of remote RMB */ u32 rkey; }; -#define SMC_LGR_ID_SIZE 4 #define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */ #define SMC_RMBE_SIZES 16 /* number of distinct RMBE sizes */ /* theoretically, the RFC states that largest size would be 512K, diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 482acf80e26e..afb889d60881 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -361,7 +361,6 @@ static int smc_llc_add_pending_send(struct smc_link *link, int smc_llc_send_confirm_link(struct smc_link *link, enum smc_llc_reqresp reqresp) { - struct smc_link_group *lgr = smc_get_lgr(link); struct smc_llc_msg_confirm_link *confllc; struct smc_wr_tx_pend_priv *pend; struct smc_wr_buf *wr_buf; @@ -382,7 +381,7 @@ int smc_llc_send_confirm_link(struct smc_link *link, memcpy(confllc->sender_gid, link->gid, SMC_GID_SIZE); hton24(confllc->sender_qp_num, link->roce_qp->qp_num); confllc->link_num = link->link_id; - memcpy(confllc->link_uid, lgr->id, SMC_LGR_ID_SIZE); + memcpy(confllc->link_uid, link->link_uid, SMC_LGR_ID_SIZE); confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS; /* send llc message */ rc = smc_wr_tx_send(link, pend); @@ -845,7 +844,8 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) if (rc) goto out_reject; smc_llc_save_add_link_info(lnk_new, llc); - lnk_new->link_id = llc->link_num; + lnk_new->link_id = llc->link_num; /* SMC server assigns link id */ + smc_llc_link_set_uid(lnk_new); rc = smc_ib_ready_link(lnk_new); if (rc) @@ -1775,12 +1775,22 @@ out: return rc; } +void smc_llc_link_set_uid(struct smc_link *link) +{ + __be32 link_uid; + + link_uid = htonl(*((u32 *)link->lgr->id) + link->link_id); + memcpy(link->link_uid, &link_uid, SMC_LGR_ID_SIZE); +} + /* evaluate confirm link request or response */ int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry, enum smc_llc_reqresp type) { - if (type == SMC_LLC_REQ) /* SMC server assigns link_id */ + if (type == SMC_LLC_REQ) { /* SMC server assigns link_id */ qentry->link->link_id = qentry->msg.confirm_link.link_num; + smc_llc_link_set_uid(qentry->link); + } if (!(qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)) return -ENOTSUPP; return 0; diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index f5882ebf357b..1b68f229cb99 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -92,6 +92,7 @@ int smc_llc_flow_initiate(struct smc_link_group *lgr, void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow); int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry, enum smc_llc_reqresp type); +void smc_llc_link_set_uid(struct smc_link *link); struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, struct smc_link *lnk, int time_out, u8 exp_msg); -- cgit v1.2.3 From 649758fff327eeb184713db8b0b0ebfa28693077 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Mon, 4 May 2020 14:18:48 +0200 Subject: net/smc: save SMC-R peer link_uid During SMC-R link establishment the peers exchange the link_uid that is used for debugging purposes. Save the peer link_uid in smc_link so it can be retrieved by the smc_diag netlink interface. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/af_smc.c | 2 ++ net/smc/smc_core.h | 1 + net/smc/smc_llc.c | 9 +++++++++ net/smc/smc_llc.h | 1 + 4 files changed, 13 insertions(+) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index c67272007f41..4e4421c95ca1 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -390,6 +390,7 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; } + smc_llc_save_peer_uid(qentry); rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ); smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl); if (rc) @@ -1056,6 +1057,7 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; } + smc_llc_save_peer_uid(qentry); rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP); smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl); if (rc) diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index e2ace20db7fd..4ae76802214f 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -119,6 +119,7 @@ struct smc_link { u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/ u8 link_id; /* unique # within link group */ u8 link_uid[SMC_LGR_ID_SIZE]; /* unique lnk id */ + u8 peer_link_uid[SMC_LGR_ID_SIZE]; /* peer uid */ u8 link_idx; /* index in lgr link array */ u8 link_is_asym; /* is link asymmetric? */ struct smc_link_group *lgr; /* parent link group */ diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index afb889d60881..66ddc9cf5e2f 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -770,6 +770,7 @@ static int smc_llc_cli_conf_link(struct smc_link *link, smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); return -ENOLINK; } + smc_llc_save_peer_uid(qentry); smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); rc = smc_ib_modify_qp_rts(link_new); @@ -1041,6 +1042,7 @@ static int smc_llc_srv_conf_link(struct smc_link *link, false, SMC_LLC_DEL_LOST_PATH); return -ENOLINK; } + smc_llc_save_peer_uid(qentry); smc_llc_link_active(link_new); if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || lgr_new_t == SMC_LGR_ASYMMETRIC_PEER) @@ -1783,6 +1785,13 @@ void smc_llc_link_set_uid(struct smc_link *link) memcpy(link->link_uid, &link_uid, SMC_LGR_ID_SIZE); } +/* save peers link user id, used for debug purposes */ +void smc_llc_save_peer_uid(struct smc_llc_qentry *qentry) +{ + memcpy(qentry->link->peer_link_uid, qentry->msg.confirm_link.link_uid, + SMC_LGR_ID_SIZE); +} + /* evaluate confirm link request or response */ int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry, enum smc_llc_reqresp type) diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 1b68f229cb99..55287376112d 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -93,6 +93,7 @@ void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow); int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry, enum smc_llc_reqresp type); void smc_llc_link_set_uid(struct smc_link *link); +void smc_llc_save_peer_uid(struct smc_llc_qentry *qentry); struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, struct smc_link *lnk, int time_out, u8 exp_msg); -- cgit v1.2.3 From bf6dba76d278d296b385b436d3ac7de56c190d44 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 30 Apr 2020 13:42:22 +0200 Subject: net: sched: fall back to qdisc noqueue if default qdisc setup fails Currently, if the default qdisc setup/init fails, the device ends up with qdisc "noop", which causes all TX packets to get dropped.
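For background on why "noqueue" is the safe choice here (a sketch of the two relevant sch_generic.c callbacks as they looked around this time, reconstructed for context and not part of the patch itself): the noop qdisc has an enqueue function that unconditionally drops, whereas noqueue clears its enqueue pointer, so __dev_queue_xmit() sees q->enqueue == NULL and hands packets directly to the driver instead of queueing them:

	static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
				struct sk_buff **to_free)
	{
		__qdisc_drop(skb, to_free);	/* every packet is dropped */
		return NET_XMIT_CN;
	}

	static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt,
				struct netlink_ext_ack *extack)
	{
		/* nothing to allocate, so this init cannot fail; the NULL
		 * enqueue makes the transmit path bypass queueing entirely
		 */
		qdisc->enqueue = NULL;
		return 0;
	}
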
With the introduction of sysctl net/core/default_qdisc it is possible to change the default qdisc to be more advanced, which opens up the possibility that Qdisc_ops->init() can fail. This patch detects this kind of failure and falls back to qdisc "noqueue", which is so simple that its init call will not fail. This allows the interface to continue functioning. V2: As this also captures memory failures, which are transient, the device is not kept in IFF_NO_QUEUE state. This allows the net_device to retry the default qdisc assignment. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 2efd5b61acef..ad24fa1a51e6 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1037,10 +1037,9 @@ static void attach_one_default_qdisc(struct net_device *dev, ops = &pfifo_fast_ops; qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL); - if (!qdisc) { - netdev_info(dev, "activation failed\n"); + if (!qdisc) return; - } + if (!netif_is_multiqueue(dev)) qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; dev_queue->qdisc_sleeping = qdisc; @@ -1065,6 +1064,18 @@ static void attach_default_qdiscs(struct net_device *dev) qdisc->ops->attach(qdisc); } } + + /* Detect default qdisc setup/init failed and fallback to "noqueue" */ + if (dev->qdisc == &noop_qdisc) { + netdev_warn(dev, "default qdisc (%s) fail, fallback to %s\n", + default_qdisc_ops->id, noqueue_qdisc_ops.id); + dev->priv_flags |= IFF_NO_QUEUE; + netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); + dev->qdisc = txq->qdisc_sleeping; + qdisc_refcount_inc(dev->qdisc); + dev->priv_flags ^= IFF_NO_QUEUE; + } + #ifdef CONFIG_NET_SCHED if (dev->qdisc != &noop_qdisc) qdisc_hash_add(dev->qdisc, false); -- cgit v1.2.3 From 39d010504e6b4485d7ceee167743620dd33f4417 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 May 2020 07:07:41 -0700 Subject: net_sched: sch_fq: add horizon attribute QUIC servers would like to use SO_TXTIME, without having CAP_NET_ADMIN, to efficiently pace UDP packets. As far as sch_fq is concerned, we need to add safety checks, so that a buggy application does not fill the qdisc with packets having delivery time far in the future. This patch adds a configurable horizon (default: 10 seconds), and a configurable policy when a packet is beyond the horizon at enqueue() time: - either drop the packet (default policy) - or cap its delivery time to the horizon. $ tc -s -d qd sh dev eth0 qdisc fq 8022: root refcnt 257 limit 10000p flow_limit 100p buckets 1024 orphan_mask 1023 quantum 10Kb initial_quantum 51160b low_rate_threshold 550Kbit refill_delay 40.0ms timer_slack 10.000us horizon 10.000s Sent 1234215879 bytes 837099 pkt (dropped 21, overlimits 0 requeues 6) backlog 0b 0p requeues 6 flows 1191 (inactive 1177 throttled 0) gc 0 highprio 0 throttled 692 latency 11.480us pkts_too_long 0 alloc_errors 0 horizon_drops 21 horizon_caps 0 v2: fixed an overflow on 32bit kernels in fq_init(), reported by kbuild test robot Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Signed-off-by: David S.
Miller --- include/uapi/linux/pkt_sched.h | 6 +++++ net/sched/sch_fq.c | 59 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 60 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 0c02737c8f47..a95f3ae7ab37 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -913,6 +913,10 @@ enum { TCA_FQ_TIMER_SLACK, /* timer slack */ + TCA_FQ_HORIZON, /* time horizon in us */ + + TCA_FQ_HORIZON_DROP, /* drop packets beyond horizon, or cap their EDT */ + __TCA_FQ_MAX }; @@ -932,6 +936,8 @@ struct tc_fq_qd_stats { __u32 throttled_flows; __u32 unthrottle_latency_ns; __u64 ce_mark; /* packets above ce_threshold */ + __u64 horizon_drops; + __u64 horizon_caps; }; /* Heavy-Hitter Filter */ diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 4f0104243cc2..8f06a808c59a 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -100,6 +100,7 @@ struct fq_sched_data { struct rb_root delayed; /* for rate limited flows */ u64 time_next_delayed_flow; + u64 ktime_cache; /* copy of last ktime_get_ns() */ unsigned long unthrottle_latency_ns; struct fq_flow internal; /* for non classified or high prio packets */ @@ -109,12 +110,13 @@ struct fq_sched_data { u32 flow_plimit; /* max packets per flow */ unsigned long flow_max_rate; /* optional max rate per flow */ u64 ce_threshold; + u64 horizon; /* horizon in ns */ u32 orphan_mask; /* mask for orphaned skb */ u32 low_rate_threshold; struct rb_root *fq_root; u8 rate_enable; u8 fq_trees_log; - + u8 horizon_drop; u32 flows; u32 inactive_flows; u32 throttled_flows; @@ -123,6 +125,8 @@ struct fq_sched_data { u64 stat_internal_packets; u64 stat_throttled; u64 stat_ce_mark; + u64 stat_horizon_drops; + u64 stat_horizon_caps; u64 stat_flows_plimit; u64 stat_pkts_too_long; u64 stat_allocation_errors; @@ -402,8 +406,6 @@ static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) struct rb_node **p, *parent; struct sk_buff *head, *aux; - fq_skb_cb(skb)->time_to_send = skb->tstamp ?: ktime_get_ns(); - head = flow->head; if (!head || fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) { @@ -431,6 +433,12 @@ static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) rb_insert_color(&skb->rbnode, &flow->t_root); } +static bool fq_packet_beyond_horizon(const struct sk_buff *skb, + const struct fq_sched_data *q) +{ + return unlikely((s64)skb->tstamp > (s64)(q->ktime_cache + q->horizon)); +} + static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -440,6 +448,28 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(sch->q.qlen >= sch->limit)) return qdisc_drop(skb, sch, to_free); + if (!skb->tstamp) { + fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns(); + } else { + /* Check if packet timestamp is too far in the future. + * Try first if our cached value, to avoid ktime_get_ns() + * cost in most cases. 
+ */ + if (fq_packet_beyond_horizon(skb, q)) { + /* Refresh our cache and check another time */ + q->ktime_cache = ktime_get_ns(); + if (fq_packet_beyond_horizon(skb, q)) { + if (q->horizon_drop) { + q->stat_horizon_drops++; + return qdisc_drop(skb, sch, to_free); + } + q->stat_horizon_caps++; + skb->tstamp = q->ktime_cache + q->horizon; + } + } + fq_skb_cb(skb)->time_to_send = skb->tstamp; + } + f = fq_classify(skb, q); if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) { q->stat_flows_plimit++; @@ -512,7 +542,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) goto out; } - now = ktime_get_ns(); + q->ktime_cache = now = ktime_get_ns(); fq_check_throttled(q, now); begin: head = &q->new_flows; @@ -765,6 +795,8 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 }, [TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 }, [TCA_FQ_TIMER_SLACK] = { .type = NLA_U32 }, + [TCA_FQ_HORIZON] = { .type = NLA_U32 }, + [TCA_FQ_HORIZON_DROP] = { .type = NLA_U8 }, }; static int fq_change(struct Qdisc *sch, struct nlattr *opt, @@ -854,7 +886,15 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_FQ_TIMER_SLACK]) q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]); + if (tb[TCA_FQ_HORIZON]) + q->horizon = (u64)NSEC_PER_USEC * + nla_get_u32(tb[TCA_FQ_HORIZON]); + + if (tb[TCA_FQ_HORIZON_DROP]) + q->horizon_drop = nla_get_u8(tb[TCA_FQ_HORIZON_DROP]); + if (!err) { + sch_tree_unlock(sch); err = fq_resize(sch, fq_log); sch_tree_lock(sch); @@ -907,6 +947,9 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt, q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */ + q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */ + q->horizon_drop = 1; /* by default, drop packets beyond horizon */ + /* Default ce_threshold of 4294 seconds */ q->ce_threshold = (u64)NSEC_PER_USEC * ~0U; @@ -924,6 +967,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct fq_sched_data *q = qdisc_priv(sch); u64 ce_threshold = q->ce_threshold; + u64 horizon = q->horizon; struct nlattr *opts; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); @@ -933,6 +977,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */ do_div(ce_threshold, NSEC_PER_USEC); + do_div(horizon, NSEC_PER_USEC); if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) || nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) || @@ -948,7 +993,9 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) q->low_rate_threshold) || nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) || nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) || - nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack)) + nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack) || + nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) || + nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop)) goto nla_put_failure; return nla_nest_end(skb, opts); @@ -979,6 +1026,8 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.unthrottle_latency_ns = min_t(unsigned long, q->unthrottle_latency_ns, ~0U); st.ce_mark = q->stat_ce_mark; + st.horizon_drops = q->stat_horizon_drops; + st.horizon_caps = q->stat_horizon_caps; sch_tree_unlock(sch); return gnet_stats_copy_app(d, &st, sizeof(st)); -- cgit v1.2.3 From dd86fec7e06ab792fe470c66a67ff42bf5d72b91 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 1 May 2020 09:40:40 -0700 Subject: devlink: factor out building a snapshot notification We'll need to send snapshot info back on the 
socket which requested a snapshot to be created. Factor out constructing a snapshot description from the broadcast notification code. v3: new patch Signed-off-by: Jakub Kicinski Reviewed-by: Jiri Pirko Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- net/core/devlink.c | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 80f97722f31f..2b7c60c18b99 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3716,24 +3716,26 @@ nla_put_failure: return err; } -static void devlink_nl_region_notify(struct devlink_region *region, - struct devlink_snapshot *snapshot, - enum devlink_command cmd) +static struct sk_buff * +devlink_nl_region_notify_build(struct devlink_region *region, + struct devlink_snapshot *snapshot, + enum devlink_command cmd, u32 portid, u32 seq) { struct devlink *devlink = region->devlink; struct sk_buff *msg; void *hdr; int err; - WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) - return; + return ERR_PTR(-ENOMEM); - hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd); - if (!hdr) + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, 0, cmd); + if (!hdr) { + err = -EMSGSIZE; goto out_free_msg; + } err = devlink_nl_put_handle(msg, devlink); if (err) @@ -3757,15 +3759,30 @@ static void devlink_nl_region_notify(struct devlink_region *region, } genlmsg_end(msg, hdr); - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); - - return; + return msg; out_cancel_msg: genlmsg_cancel(msg, hdr); out_free_msg: nlmsg_free(msg); + return ERR_PTR(err); +} + +static void devlink_nl_region_notify(struct devlink_region *region, + struct devlink_snapshot *snapshot, + enum devlink_command cmd) +{ + struct devlink *devlink = region->devlink; + struct sk_buff *msg; + + WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL); + + msg = devlink_nl_region_notify_build(region, snapshot, cmd, 0, 0); + if (IS_ERR(msg)) + return; + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } /** -- cgit v1.2.3 From 043b3e22768d5d909cb1474fc21ae2fbaf026c0c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 1 May 2020 09:40:41 -0700 Subject: devlink: let kernel allocate region snapshot id Currently users have to choose a free snapshot id before calling DEVLINK_CMD_REGION_NEW. This is potentially racy and inconvenient. Make the DEVLINK_ATTR_REGION_SNAPSHOT_ID optional and try to allocate id automatically. Send a message back to the caller with the snapshot info. Example use: $ devlink region new netdevsim/netdevsim1/dummy netdevsim/netdevsim1/dummy: snapshot 1 $ id=$(devlink -j region new netdevsim/netdevsim1/dummy | \ jq '.[][][][]') $ devlink region dump netdevsim/netdevsim1/dummy snapshot $id [...] $ devlink region del netdevsim/netdevsim1/dummy snapshot $id v4: - inline the notification code v3: - send the notification only once snapshot creation completed. v2: - don't wrap the line containing extack; - add a few sentences to the docs. Signed-off-by: Jakub Kicinski Reviewed-by: Jacob Keller Signed-off-by: David S. 
Miller --- .../networking/devlink/devlink-region.rst | 7 ++- net/core/devlink.c | 57 +++++++++++++++++----- .../selftests/drivers/net/netdevsim/devlink.sh | 13 +++++ 3 files changed, 62 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/Documentation/networking/devlink/devlink-region.rst b/Documentation/networking/devlink/devlink-region.rst index 04e04d1ff627..daf35427fce1 100644 --- a/Documentation/networking/devlink/devlink-region.rst +++ b/Documentation/networking/devlink/devlink-region.rst @@ -23,7 +23,9 @@ states, but see also :doc:`devlink-health` Regions may optionally support capturing a snapshot on demand via the ``DEVLINK_CMD_REGION_NEW`` netlink message. A driver wishing to allow requested snapshots must implement the ``.snapshot`` callback for the region -in its ``devlink_region_ops`` structure. +in its ``devlink_region_ops`` structure. If snapshot id is not set in +the ``DEVLINK_CMD_REGION_NEW`` request kernel will allocate one and send +the snapshot information to user space. example usage ------------- @@ -45,7 +47,8 @@ example usage $ devlink region del pci/0000:00:05.0/cr-space snapshot 1 # Request an immediate snapshot, if supported by the region - $ devlink region new pci/0000:00:05.0/cr-space snapshot 5 + $ devlink region new pci/0000:00:05.0/cr-space + pci/0000:00:05.0/cr-space: snapshot 5 # Dump a snapshot: $ devlink region dump pci/0000:00:05.0/fw-health snapshot 1 diff --git a/net/core/devlink.c b/net/core/devlink.c index 2b7c60c18b99..43a9d5be73ca 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4086,6 +4086,8 @@ static int devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; + struct devlink_snapshot *snapshot; + struct nlattr *snapshot_id_attr; struct devlink_region *region; const char *region_name; u32 snapshot_id; @@ -4097,11 +4099,6 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } - if (!info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) { - NL_SET_ERR_MSG_MOD(info->extack, "No snapshot id provided"); - return -EINVAL; - } - region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); region = devlink_region_get_by_name(devlink, region_name); if (!region) { @@ -4119,16 +4116,25 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) return -ENOSPC; } - snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]); + snapshot_id_attr = info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]; + if (snapshot_id_attr) { + snapshot_id = nla_get_u32(snapshot_id_attr); - if (devlink_region_snapshot_get_by_id(region, snapshot_id)) { - NL_SET_ERR_MSG_MOD(info->extack, "The requested snapshot id is already in use"); - return -EEXIST; - } + if (devlink_region_snapshot_get_by_id(region, snapshot_id)) { + NL_SET_ERR_MSG_MOD(info->extack, "The requested snapshot id is already in use"); + return -EEXIST; + } - err = __devlink_snapshot_id_insert(devlink, snapshot_id); - if (err) - return err; + err = __devlink_snapshot_id_insert(devlink, snapshot_id); + if (err) + return err; + } else { + err = __devlink_region_snapshot_id_get(devlink, &snapshot_id); + if (err) { + NL_SET_ERR_MSG_MOD(info->extack, "Failed to allocate a new snapshot id"); + return err; + } + } err = region->ops->snapshot(devlink, info->extack, &data); if (err) @@ -4138,6 +4144,27 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) if (err) goto err_snapshot_create; + if (!snapshot_id_attr) { + struct sk_buff *msg; + + snapshot = 
devlink_region_snapshot_get_by_id(region, + snapshot_id); + if (WARN_ON(!snapshot)) + return -EINVAL; + + msg = devlink_nl_region_notify_build(region, snapshot, + DEVLINK_CMD_REGION_NEW, + info->snd_portid, + info->snd_seq); + err = PTR_ERR_OR_ZERO(msg); + if (err) + goto err_notify; + + err = genlmsg_reply(msg, info); + if (err) + goto err_notify; + } + return 0; err_snapshot_create: @@ -4145,6 +4172,10 @@ err_snapshot_create: err_snapshot_capture: __devlink_snapshot_id_decrement(devlink, snapshot_id); return err; + +err_notify: + devlink_region_snapshot_del(region, snapshot); + return err; } static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg, diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh index 9f9741444549..ad539eccddcb 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh @@ -151,6 +151,19 @@ regions_test() check_region_snapshot_count dummy post-second-delete 2 + sid=$(devlink -j region new $DL_HANDLE/dummy | jq '.[][][][]') + check_err $? "Failed to create a new snapshot with id allocated by the kernel" + + check_region_snapshot_count dummy post-first-request 3 + + devlink region dump $DL_HANDLE/dummy snapshot $sid >> /dev/null + check_err $? "Failed to dump a snapshot with id allocated by the kernel" + + devlink region del $DL_HANDLE/dummy snapshot $sid + check_err $? "Failed to delete snapshot with id allocated by the kernel" + + check_region_snapshot_count dummy post-first-request 2 + log_test "regions test" } -- cgit v1.2.3 From 1a33e10e4a95cb109ff1145098175df3113313ef Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 2 May 2020 22:22:19 -0700 Subject: net: partially revert dynamic lockdep key changes This patch reverts the following commits: commit 064ff66e2bef84f1153087612032b5b9eab005bd "bonding: add missing netdev_update_lockdep_key()" commit 53d374979ef147ab51f5d632dfe20b14aebeccd0 "net: avoid updating qdisc_xmit_lock_key in netdev_update_lockdep_key()" commit 1f26c0d3d24125992ab0026b0dab16c08df947c7 "net: fix kernel-doc warning in <linux/netdevice.h>" commit ab92d68fc22f9afab480153bd82a20f6e2533769 "net: core: add generic lockdep keys" but keeps the addr_list_lock_key because we still lock addr_list_lock nestedly on stack devices; unlike xmit_lock, this is safe because we don't take addr_list_lock on any fast path. Reported-and-tested-by: syzbot+aaa6fa4949cc5d9b7b25@syzkaller.appspotmail.com Cc: Dmitry Vyukov Cc: Taehee Yoo Signed-off-by: Cong Wang Acked-by: Taehee Yoo Signed-off-by: David S.
Miller --- drivers/net/bonding/bond_main.c | 1 + drivers/net/ethernet/netronome/nfp/nfp_net_repr.c | 16 ++++ drivers/net/hamradio/bpqether.c | 20 +++++ drivers/net/hyperv/netvsc_drv.c | 2 + drivers/net/ipvlan/ipvlan_main.c | 2 + drivers/net/macsec.c | 2 + drivers/net/macvlan.c | 2 + drivers/net/ppp/ppp_generic.c | 2 + drivers/net/team/team.c | 1 + drivers/net/vrf.c | 1 + drivers/net/wireless/intersil/hostap/hostap_hw.c | 22 ++++++ include/linux/netdevice.h | 27 +++++-- net/8021q/vlan_dev.c | 21 ++++++ net/batman-adv/soft-interface.c | 30 ++++++++ net/bluetooth/6lowpan.c | 8 ++ net/core/dev.c | 90 ++++++++++++++++++----- net/dsa/slave.c | 12 +++ net/ieee802154/6lowpan/core.c | 8 ++ net/l2tp/l2tp_eth.c | 1 + net/netrom/af_netrom.c | 21 ++++++ net/rose/af_rose.c | 21 ++++++ net/sched/sch_generic.c | 17 +++-- 22 files changed, 294 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 2e70e43c5df5..d01871321d22 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4898,6 +4898,7 @@ static int bond_init(struct net_device *bond_dev) spin_lock_init(&bond->stats_lock); lockdep_register_key(&bond->stats_lock_key); lockdep_set_class(&bond->stats_lock, &bond->stats_lock_key); + netdev_lockdep_set_classes(bond_dev); list_add_tail(&bond->bond_list, &bn->dev_list); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c index 79d72c88bbef..b3cabc274121 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c @@ -299,6 +299,20 @@ static void nfp_repr_clean(struct nfp_repr *repr) nfp_port_free(repr->port); } +static struct lock_class_key nfp_repr_netdev_xmit_lock_key; + +static void nfp_repr_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &nfp_repr_netdev_xmit_lock_key); +} + +static void nfp_repr_set_lockdep_class(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, nfp_repr_set_lockdep_class_one, NULL); +} + int nfp_repr_init(struct nfp_app *app, struct net_device *netdev, u32 cmsg_port_id, struct nfp_port *port, struct net_device *pf_netdev) @@ -308,6 +322,8 @@ int nfp_repr_init(struct nfp_app *app, struct net_device *netdev, u32 repr_cap = nn->tlv_caps.repr_cap; int err; + nfp_repr_set_lockdep_class(netdev); + repr->port = port; repr->dst = metadata_dst_alloc(0, METADATA_HW_PORT_MUX, GFP_KERNEL); if (!repr->dst) diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c index fbea6f232819..206688154fdf 100644 --- a/drivers/net/hamradio/bpqether.c +++ b/drivers/net/hamradio/bpqether.c @@ -107,6 +107,25 @@ struct bpqdev { static LIST_HEAD(bpq_devices); +/* + * bpqether network devices are paired with ethernet devices below them, so + * form a special "super class" of normal ethernet devices; split their locks + * off into a separate class since they always nest. 
+ */ +static struct lock_class_key bpq_netdev_xmit_lock_key; + +static void bpq_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &bpq_netdev_xmit_lock_key); +} + +static void bpq_set_lockdep_class(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, bpq_set_lockdep_class_one, NULL); +} + /* ------------------------------------------------------------------------ */ @@ -477,6 +496,7 @@ static int bpq_new_device(struct net_device *edev) err = register_netdevice(ndev); if (err) goto error; + bpq_set_lockdep_class(ndev); /* List protected by RTNL */ list_add_rcu(&bpq->bpq_list, &bpq_devices); diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index d8e86bdbfba1..c0b647a4c893 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -2456,6 +2456,8 @@ static int netvsc_probe(struct hv_device *dev, NETIF_F_HW_VLAN_CTAG_RX; net->vlan_features = net->features; + netdev_lockdep_set_classes(net); + /* MTU range: 68 - 1500 or 65521 */ net->min_mtu = NETVSC_MTU_MIN; if (nvdev->nvsp_version >= NVSP_PROTOCOL_VERSION_2) diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index f195f278a83a..15e87c097b0b 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -131,6 +131,8 @@ static int ipvlan_init(struct net_device *dev) dev->gso_max_segs = phy_dev->gso_max_segs; dev->hard_header_len = phy_dev->hard_header_len; + netdev_lockdep_set_classes(dev); + ipvlan->pcpu_stats = netdev_alloc_pcpu_stats(struct ipvl_pcpu_stats); if (!ipvlan->pcpu_stats) return -ENOMEM; diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 758baf7cb8a1..ea3f25cc79ef 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -4047,6 +4047,8 @@ static int macsec_newlink(struct net *net, struct net_device *dev, if (err < 0) return err; + netdev_lockdep_set_classes(dev); + err = netdev_upper_dev_link(real_dev, dev, extack); if (err < 0) goto unregister; diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index d45600e0a38c..34eb073cdd74 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -890,6 +890,8 @@ static int macvlan_init(struct net_device *dev) dev->gso_max_segs = lowerdev->gso_max_segs; dev->hard_header_len = lowerdev->hard_header_len; + netdev_lockdep_set_classes(dev); + vlan->pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->pcpu_stats) return -ENOMEM; diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 22cc2cb9d878..7d005896a0f9 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -1410,6 +1410,8 @@ static int ppp_dev_init(struct net_device *dev) { struct ppp *ppp; + netdev_lockdep_set_classes(dev); + ppp = netdev_priv(dev); /* Let the netdevice take a reference on the ppp file. 
This ensures * that ppp_destroy_interface() won't run before the device gets diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 04845a4017f9..8c1e02752ff6 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1647,6 +1647,7 @@ static int team_init(struct net_device *dev) lockdep_register_key(&team->team_lock_key); __mutex_init(&team->lock, "team->team_lock_key", &team->team_lock_key); + netdev_lockdep_set_classes(dev); return 0; diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 56f8aab46f89..43928a1c2f2a 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -867,6 +867,7 @@ static int vrf_dev_init(struct net_device *dev) /* similarly, oper state is irrelevant; set to up to avoid confusion */ dev->operstate = IF_OPER_UP; + netdev_lockdep_set_classes(dev); return 0; out_rth: diff --git a/drivers/net/wireless/intersil/hostap/hostap_hw.c b/drivers/net/wireless/intersil/hostap/hostap_hw.c index 58212c532c90..aadf3dec5bf3 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_hw.c +++ b/drivers/net/wireless/intersil/hostap/hostap_hw.c @@ -3041,6 +3041,27 @@ static void prism2_clear_set_tim_queue(local_info_t *local) } } + +/* + * HostAP uses two layers of net devices, where the inner + * layer gets called all the time from the outer layer. + * This is a natural nesting, which needs a split lock type. + */ +static struct lock_class_key hostap_netdev_xmit_lock_key; + +static void prism2_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, + &hostap_netdev_xmit_lock_key); +} + +static void prism2_set_lockdep_class(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, prism2_set_lockdep_class_one, NULL); +} + static struct net_device * prism2_init_local_data(struct prism2_helper_functions *funcs, int card_idx, struct device *sdev) @@ -3199,6 +3220,7 @@ while (0) if (ret >= 0) ret = register_netdevice(dev); + prism2_set_lockdep_class(dev); rtnl_unlock(); if (ret < 0) { printk(KERN_WARNING "%s: register netdevice failed!\n", diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5a8d40f1ffe2..7725efd6e48a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1805,13 +1805,11 @@ enum netdev_priv_flags { * @phydev: Physical device may attach itself * for hardware timestamping * @sfp_bus: attached &struct sfp_bus structure. 
- * @qdisc_tx_busylock_key: lockdep class annotating Qdisc->busylock - * spinlock - * @qdisc_running_key: lockdep class annotating Qdisc->running seqcount - * @qdisc_xmit_lock_key: lockdep class annotating - * netdev_queue->_xmit_lock spinlock + * * @addr_list_lock_key: lockdep class annotating * net_device->addr_list_lock spinlock + * @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock + * @qdisc_running_key: lockdep class annotating Qdisc->running seqcount * * @proto_down: protocol port state information can be sent to the * switch driver and used to set the phys state of the @@ -2112,10 +2110,9 @@ struct net_device { #endif struct phy_device *phydev; struct sfp_bus *sfp_bus; - struct lock_class_key qdisc_tx_busylock_key; - struct lock_class_key qdisc_running_key; - struct lock_class_key qdisc_xmit_lock_key; struct lock_class_key addr_list_lock_key; + struct lock_class_key *qdisc_tx_busylock; + struct lock_class_key *qdisc_running_key; bool proto_down; unsigned wol_enabled:1; @@ -2200,6 +2197,20 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, f(dev, &dev->_tx[i], arg); } +#define netdev_lockdep_set_classes(dev) \ +{ \ + static struct lock_class_key qdisc_tx_busylock_key; \ + static struct lock_class_key qdisc_running_key; \ + static struct lock_class_key qdisc_xmit_lock_key; \ + unsigned int i; \ + \ + (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key; \ + (dev)->qdisc_running_key = &qdisc_running_key; \ + for (i = 0; i < (dev)->num_tx_queues; i++) \ + lockdep_set_class(&(dev)->_tx[i]._xmit_lock, \ + &qdisc_xmit_lock_key); \ +} + u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 990b9fde28c6..319220b2341d 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -489,6 +489,25 @@ static void vlan_dev_set_rx_mode(struct net_device *vlan_dev) dev_uc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev); } +/* + * vlan network devices have devices nesting below it, and are a special + * "super class" of normal network devices; split their locks off into a + * separate class since they always nest. + */ +static struct lock_class_key vlan_netdev_xmit_lock_key; + +static void vlan_dev_set_lockdep_one(struct net_device *dev, + struct netdev_queue *txq, + void *unused) +{ + lockdep_set_class(&txq->_xmit_lock, &vlan_netdev_xmit_lock_key); +} + +static void vlan_dev_set_lockdep_class(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, NULL); +} + static const struct header_ops vlan_header_ops = { .create = vlan_dev_hard_header, .parse = eth_header_parse, @@ -579,6 +598,8 @@ static int vlan_dev_init(struct net_device *dev) SET_NETDEV_DEVTYPE(dev, &vlan_type); + vlan_dev_set_lockdep_class(dev); + vlan->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->vlan_pcpu_stats) return -ENOMEM; diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 5f05a728f347..822af540b854 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -739,6 +739,34 @@ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto, return 0; } +/* batman-adv network devices have devices nesting below it and are a special + * "super class" of normal network devices; split their locks off into a + * separate class since they always nest. 
+ */ +static struct lock_class_key batadv_netdev_xmit_lock_key; + +/** + * batadv_set_lockdep_class_one() - Set lockdep class for a single tx queue + * @dev: device which owns the tx queue + * @txq: tx queue to modify + * @_unused: always NULL + */ +static void batadv_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &batadv_netdev_xmit_lock_key); +} + +/** + * batadv_set_lockdep_class() - Set txq and addr_list lockdep class + * @dev: network device to modify + */ +static void batadv_set_lockdep_class(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, batadv_set_lockdep_class_one, NULL); +} + /** * batadv_softif_init_late() - late stage initialization of soft interface * @dev: registered network device to modify @@ -752,6 +780,8 @@ static int batadv_softif_init_late(struct net_device *dev) int ret; size_t cnt_len = sizeof(u64) * BATADV_CNT_NUM; + batadv_set_lockdep_class(dev); + bat_priv = netdev_priv(dev); bat_priv->soft_iface = dev; diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 4febc82a7c76..bb55d92691b0 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -571,7 +571,15 @@ static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev) return err < 0 ? NET_XMIT_DROP : err; } +static int bt_dev_init(struct net_device *dev) +{ + netdev_lockdep_set_classes(dev); + + return 0; +} + static const struct net_device_ops netdev_ops = { + .ndo_init = bt_dev_init, .ndo_start_xmit = bt_xmit, }; diff --git a/net/core/dev.c b/net/core/dev.c index afff16849c26..f8d83922a6af 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -398,6 +398,74 @@ static RAW_NOTIFIER_HEAD(netdev_chain); DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); EXPORT_PER_CPU_SYMBOL(softnet_data); +#ifdef CONFIG_LOCKDEP +/* + * register_netdevice() inits txq->_xmit_lock and sets lockdep class + * according to dev->type + */ +static const unsigned short netdev_lock_type[] = { + ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, + ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, + ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, + ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, + ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, + ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, + ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, + ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, + ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, + ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, + ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, + ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, + ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, + ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, + ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; + +static const char *const netdev_lock_name[] = { + "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", + "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", + "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", + "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", + "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", + "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", + "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", + "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", + "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", + "_xmit_BIF", "_xmit_SIT", 
"_xmit_IPDDP", "_xmit_IPGRE", + "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", + "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", + "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", + "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", + "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; + +static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; + +static inline unsigned short netdev_lock_pos(unsigned short dev_type) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) + if (netdev_lock_type[i] == dev_type) + return i; + /* the last key is used by default */ + return ARRAY_SIZE(netdev_lock_type) - 1; +} + +static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, + unsigned short dev_type) +{ + int i; + + i = netdev_lock_pos(dev_type); + lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], + netdev_lock_name[i]); +} +#else +static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, + unsigned short dev_type) +{ +} +#endif + /******************************************************************************* * * Protocol management and registration routines @@ -9208,7 +9276,7 @@ static void netdev_init_one_queue(struct net_device *dev, { /* Initialize queue lock */ spin_lock_init(&queue->_xmit_lock); - lockdep_set_class(&queue->_xmit_lock, &dev->qdisc_xmit_lock_key); + netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); queue->xmit_lock_owner = -1; netdev_queue_numa_node_write(queue, NUMA_NO_NODE); queue->dev = dev; @@ -9255,22 +9323,6 @@ void netif_tx_stop_all_queues(struct net_device *dev) } EXPORT_SYMBOL(netif_tx_stop_all_queues); -static void netdev_register_lockdep_key(struct net_device *dev) -{ - lockdep_register_key(&dev->qdisc_tx_busylock_key); - lockdep_register_key(&dev->qdisc_running_key); - lockdep_register_key(&dev->qdisc_xmit_lock_key); - lockdep_register_key(&dev->addr_list_lock_key); -} - -static void netdev_unregister_lockdep_key(struct net_device *dev) -{ - lockdep_unregister_key(&dev->qdisc_tx_busylock_key); - lockdep_unregister_key(&dev->qdisc_running_key); - lockdep_unregister_key(&dev->qdisc_xmit_lock_key); - lockdep_unregister_key(&dev->addr_list_lock_key); -} - void netdev_update_lockdep_key(struct net_device *dev) { lockdep_unregister_key(&dev->addr_list_lock_key); @@ -9837,7 +9889,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); - netdev_register_lockdep_key(dev); + lockdep_register_key(&dev->addr_list_lock_key); dev->gso_max_size = GSO_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; @@ -9926,7 +9978,7 @@ void free_netdev(struct net_device *dev) free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; - netdev_unregister_lockdep_key(dev); + lockdep_unregister_key(&dev->addr_list_lock_key); /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index ba8bf90dc0cc..fa2634043751 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1671,6 +1671,15 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) return ret; } +static struct lock_class_key dsa_slave_netdev_xmit_lock_key; +static void dsa_slave_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, + &dsa_slave_netdev_xmit_lock_key); +} + int dsa_slave_suspend(struct net_device *slave_dev) { struct dsa_port *dp = dsa_slave_to_port(slave_dev); @@ -1754,6 +1763,9 @@ int 
dsa_slave_create(struct dsa_port *port) slave_dev->max_mtu = ETH_MAX_MTU; SET_NETDEV_DEVTYPE(slave_dev, &dsa_type); + netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one, + NULL); + SET_NETDEV_DEV(slave_dev, port->ds->dev); slave_dev->dev.of_node = port->dn; slave_dev->vlan_features = master->vlan_features; diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index c0b107cdd715..3297e7fa9945 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -58,6 +58,13 @@ static const struct header_ops lowpan_header_ops = { .create = lowpan_header_create, }; +static int lowpan_dev_init(struct net_device *ldev) +{ + netdev_lockdep_set_classes(ldev); + + return 0; +} + static int lowpan_open(struct net_device *dev) { if (!open_count) @@ -89,6 +96,7 @@ static int lowpan_get_iflink(const struct net_device *dev) } static const struct net_device_ops lowpan_netdev_ops = { + .ndo_init = lowpan_dev_init, .ndo_start_xmit = lowpan_xmit, .ndo_open = lowpan_open, .ndo_stop = lowpan_stop, diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index d3b520b9b2c9..fd5ac2788e45 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -56,6 +56,7 @@ static int l2tp_eth_dev_init(struct net_device *dev) { eth_hw_addr_random(dev); eth_broadcast_addr(dev->broadcast); + netdev_lockdep_set_classes(dev); return 0; } diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 7b1a74f74aad..eccc7d366e17 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -63,6 +63,26 @@ static DEFINE_SPINLOCK(nr_list_lock); static const struct proto_ops nr_proto_ops; +/* + * NETROM network devices are virtual network devices encapsulating NETROM + * frames into AX.25 which will be sent through an AX.25 device, so form a + * special "super class" of normal net devices; split their locks off into a + * separate class since they always nest. + */ +static struct lock_class_key nr_netdev_xmit_lock_key; + +static void nr_set_lockdep_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &nr_netdev_xmit_lock_key); +} + +static void nr_set_lockdep_key(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, nr_set_lockdep_one, NULL); +} + /* * Socket removal during an interrupt is now safe. */ @@ -1394,6 +1414,7 @@ static int __init nr_proto_init(void) free_netdev(dev); goto fail; } + nr_set_lockdep_key(dev); dev_nr[i] = dev; } diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 1e8eeb044b07..e7a872207b46 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -64,6 +64,26 @@ static const struct proto_ops rose_proto_ops; ax25_address rose_callsign; +/* + * ROSE network devices are virtual network devices encapsulating ROSE + * frames into AX.25 which will be sent through an AX.25 device, so form a + * special "super class" of normal net devices; split their locks off into a + * separate class since they always nest. + */ +static struct lock_class_key rose_netdev_xmit_lock_key; + +static void rose_set_lockdep_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &rose_netdev_xmit_lock_key); +} + +static void rose_set_lockdep_key(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, rose_set_lockdep_one, NULL); +} + /* * Convert a ROSE address into text. 
*/ @@ -1511,6 +1531,7 @@ static int __init rose_proto_init(void) free_netdev(dev); goto fail; } + rose_set_lockdep_key(dev); dev_rose[i] = dev; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index ad24fa1a51e6..ebc55d884247 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -794,6 +794,9 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { }; EXPORT_SYMBOL(pfifo_fast_ops); +static struct lock_class_key qdisc_tx_busylock; +static struct lock_class_key qdisc_running_key; + struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, struct netlink_ext_ack *extack) @@ -846,9 +849,17 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, } spin_lock_init(&sch->busylock); + lockdep_set_class(&sch->busylock, + dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); + /* seqlock has the same scope of busylock, for NOLOCK qdisc */ spin_lock_init(&sch->seqlock); + lockdep_set_class(&sch->busylock, + dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); + seqcount_init(&sch->running); + lockdep_set_class(&sch->running, + dev->qdisc_running_key ?: &qdisc_running_key); sch->ops = ops; sch->flags = ops->static_flags; @@ -859,12 +870,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, dev_hold(dev); refcount_set(&sch->refcnt, 1); - if (sch != &noop_qdisc) { - lockdep_set_class(&sch->busylock, &dev->qdisc_tx_busylock_key); - lockdep_set_class(&sch->seqlock, &dev->qdisc_tx_busylock_key); - lockdep_set_class(&sch->running, &dev->qdisc_running_key); - } - return sch; errout1: kfree(p); -- cgit v1.2.3 From e4e5aefc113510c03d34e182ab30bc0cc196675c Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 4 May 2020 15:33:51 +0200 Subject: xsk: Change two variable names for increased clarity Change two variable names so that it is clearer what they represent. The first one is xsk_list, which in fact only contains the list of AF_XDP sockets with a Tx component. Change this to xsk_tx_list for improved clarity. The second variable is size in the ring structure. One might think that this is the size of the ring, but it is in fact the size of the umem, copied into the ring structure to improve performance. Rename this variable to umem_size to avoid any confusion.
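For illustration only, here is a minimal userspace C sketch of the validation path that motivates the second rename; the struct and function names are simplified stand-ins rather than the kernel's exact definitions, and only the bounds test mirrors the patched xskq_cons_is_valid_addr() logic. The ring caches the umem size so a descriptor address can be checked without touching the umem object itself:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for struct xsk_queue: umem_size is a cached copy
 * of the umem's size, while nentries is the actual ring size. Keeping
 * the two names distinct avoids the confusion the patch describes. */
struct xsk_queue {
        uint64_t umem_size;     /* size of the umem, not of the ring */
        uint32_t nentries;      /* number of ring entries */
        uint64_t invalid_descs;
};

/* Any address at or beyond the umem boundary is an invalid descriptor. */
static bool xskq_addr_ok(struct xsk_queue *q, uint64_t addr)
{
        if (addr >= q->umem_size) {
                q->invalid_descs++;
                return false;
        }
        return true;
}

int main(void)
{
        struct xsk_queue q = { .umem_size = 1ULL << 20, .nentries = 256 };

        /* prints "1 0": 4096 is inside the 1 MiB umem, 1 MiB is not */
        printf("%d %d\n", xskq_addr_ok(&q, 4096), xskq_addr_ok(&q, 1ULL << 20));
        return 0;
}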
Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Jonathan Lemon Link: https://lore.kernel.org/bpf/1588599232-24897-2-git-send-email-magnus.karlsson@intel.com --- include/net/xdp_sock.h | 4 ++-- net/xdp/xdp_umem.c | 14 +++++++------- net/xdp/xsk.c | 8 ++++---- net/xdp/xsk_queue.c | 4 ++-- net/xdp/xsk_queue.h | 8 ++++---- 5 files changed, 19 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index e86ec48ef627..b72f1f4c3b15 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -62,8 +62,8 @@ struct xdp_umem { struct net_device *dev; struct xdp_umem_fq_reuse *fq_reuse; bool zc; - spinlock_t xsk_list_lock; - struct list_head xsk_list; + spinlock_t xsk_tx_list_lock; + struct list_head xsk_tx_list; }; /* Nodes are linked in the struct xdp_sock map_list field, and used to diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index ed7a6060f73c..7211f4572760 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -30,9 +30,9 @@ void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) if (!xs->tx) return; - spin_lock_irqsave(&umem->xsk_list_lock, flags); - list_add_rcu(&xs->list, &umem->xsk_list); - spin_unlock_irqrestore(&umem->xsk_list_lock, flags); + spin_lock_irqsave(&umem->xsk_tx_list_lock, flags); + list_add_rcu(&xs->list, &umem->xsk_tx_list); + spin_unlock_irqrestore(&umem->xsk_tx_list_lock, flags); } void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) @@ -42,9 +42,9 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) if (!xs->tx) return; - spin_lock_irqsave(&umem->xsk_list_lock, flags); + spin_lock_irqsave(&umem->xsk_tx_list_lock, flags); list_del_rcu(&xs->list); - spin_unlock_irqrestore(&umem->xsk_list_lock, flags); + spin_unlock_irqrestore(&umem->xsk_tx_list_lock, flags); } /* The umem is stored both in the _rx struct and the _tx struct as we do @@ -395,8 +395,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) umem->pgs = NULL; umem->user = NULL; umem->flags = mr->flags; - INIT_LIST_HEAD(&umem->xsk_list); - spin_lock_init(&umem->xsk_list_lock); + INIT_LIST_HEAD(&umem->xsk_tx_list); + spin_lock_init(&umem->xsk_tx_list_lock); refcount_set(&umem->users, 1); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index f6e6609f70a3..45ffd67b367d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -75,7 +75,7 @@ void xsk_set_tx_need_wakeup(struct xdp_umem *umem) return; rcu_read_lock(); - list_for_each_entry_rcu(xs, &umem->xsk_list, list) { + list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; } rcu_read_unlock(); @@ -102,7 +102,7 @@ void xsk_clear_tx_need_wakeup(struct xdp_umem *umem) return; rcu_read_lock(); - list_for_each_entry_rcu(xs, &umem->xsk_list, list) { + list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP; } rcu_read_unlock(); @@ -305,7 +305,7 @@ void xsk_umem_consume_tx_done(struct xdp_umem *umem) struct xdp_sock *xs; rcu_read_lock(); - list_for_each_entry_rcu(xs, &umem->xsk_list, list) { + list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { __xskq_cons_release(xs->tx); xs->sk.sk_write_space(&xs->sk); } @@ -318,7 +318,7 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc) struct xdp_sock *xs; rcu_read_lock(); - list_for_each_entry_rcu(xs, &umem->xsk_list, list) { + list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { if (!xskq_cons_peek_desc(xs->tx, desc, umem)) continue; diff --git a/net/xdp/xsk_queue.c 
b/net/xdp/xsk_queue.c index c90e9c1e3c63..57fb81bd593c 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -9,12 +9,12 @@ #include "xsk_queue.h" -void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask) +void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask) { if (!q) return; - q->size = size; + q->umem_size = umem_size; q->chunk_mask = chunk_mask; } diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index b50bb5c76da5..648733ec24ac 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -30,7 +30,7 @@ struct xdp_umem_ring { struct xsk_queue { u64 chunk_mask; - u64 size; + u64 umem_size; u32 ring_mask; u32 nentries; u32 cached_prod; @@ -123,7 +123,7 @@ static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q, u64 base_addr = xsk_umem_extract_addr(addr); addr = xsk_umem_add_offset_to_addr(addr); - if (base_addr >= q->size || addr >= q->size || + if (base_addr >= q->umem_size || addr >= q->umem_size || xskq_cons_crosses_non_contig_pg(umem, addr, length)) { q->invalid_descs++; return false; @@ -134,7 +134,7 @@ static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr) { - if (addr >= q->size) { + if (addr >= q->umem_size) { q->invalid_descs++; return false; } @@ -379,7 +379,7 @@ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) return q ? q->invalid_descs : 0; } -void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask); +void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask); struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q_ops); -- cgit v1.2.3 From 07bf2d97d1f37e7ac8d7be2d84ff108d43556a1d Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 4 May 2020 15:33:52 +0200 Subject: xsk: Remove unnecessary member in xdp_umem Remove the unnecessary address member from struct xdp_umem, as it is only used during umem registration. There is no need to carry it around, since it is used neither at run time nor when unregistering the umem.
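As a hedged illustration of the pattern this patch applies (hypothetical names, plain userspace C, not the kernel API): data needed only while registering an object is passed down as an argument and consumed on the spot, instead of being stored in the long-lived structure:

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical long-lived object: note there is no address field,
 * mirroring the removal of xdp_umem->address. */
struct umem {
        unsigned int npgs;
        void **pgs;
};

/* The registration-time address is consumed here and never stored. */
static int umem_pin_pages(struct umem *umem, unsigned long address)
{
        umem->pgs = calloc(umem->npgs, sizeof(*umem->pgs));
        if (!umem->pgs)
                return -1;
        printf("pinning %u pages backing 0x%lx\n", umem->npgs, address);
        return 0;
}

static int umem_reg(struct umem *umem, unsigned long address)
{
        umem->npgs = 4;
        return umem_pin_pages(umem, address);   /* pass it, don't keep it */
}

int main(void)
{
        struct umem u = { 0, NULL };
        int err = umem_reg(&u, 0x1000UL);

        free(u.pgs);
        return err ? 1 : 0;
}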
Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Jonathan Lemon Link: https://lore.kernel.org/bpf/1588599232-24897-3-git-send-email-magnus.karlsson@intel.com --- include/net/xdp_sock.h | 1 - net/xdp/xdp_umem.c | 7 +++---- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index b72f1f4c3b15..67191ccaab85 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -50,7 +50,6 @@ struct xdp_umem { u32 headroom; u32 chunk_size_nohr; struct user_struct *user; - unsigned long address; refcount_t users; struct work_struct work; struct page **pgs; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 7211f4572760..37ace3bc0d48 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -279,7 +279,7 @@ void xdp_put_umem(struct xdp_umem *umem) } } -static int xdp_umem_pin_pages(struct xdp_umem *umem) +static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address) { unsigned int gup_flags = FOLL_WRITE; long npgs; @@ -291,7 +291,7 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem) return -ENOMEM; down_read(&current->mm->mmap_sem); - npgs = pin_user_pages(umem->address, umem->npgs, + npgs = pin_user_pages(address, umem->npgs, gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL); up_read(&current->mm->mmap_sem); @@ -385,7 +385,6 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (headroom >= chunk_size - XDP_PACKET_HEADROOM) return -EINVAL; - umem->address = (unsigned long)addr; umem->chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK : ~((u64)chunk_size - 1); umem->size = size; @@ -404,7 +403,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (err) return err; - err = xdp_umem_pin_pages(umem); + err = xdp_umem_pin_pages(umem, (unsigned long)addr); if (err) goto out_account; -- cgit v1.2.3 From 592138a88d967ae9279275ef275b729e866a552a Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 5 May 2020 16:47:36 +0800 Subject: net: sched: choke: Remove unused inline function choke_set_classid There are no callers left in-tree since commit 5952fde10c35 ("net: sched: choke: remove dead filter classify code"). Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/sched/sch_choke.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index a36974e9c601..2d350c734375 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -142,11 +142,6 @@ static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb) return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data; } -static inline void choke_set_classid(struct sk_buff *skb, u16 classid) -{ - choke_skb_cb(skb)->classid = classid; -} - /* * Compare flow of two packets * Returns true only if source and destination address and port match. -- cgit v1.2.3 From fe121e078da1fd8a061ab22f26c5911f8ebf46cb Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 May 2020 12:08:02 -0700 Subject: sch_choke: Remove classid from choke_skb_cb. Suggested by Cong Wang. Signed-off-by: David S.
Miller --- net/sched/sch_choke.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 2d350c734375..59ff466ec7cb 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -131,7 +131,6 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx, } struct choke_skb_cb { - u16 classid; u8 keys_valid; struct flow_keys_digest keys; }; -- cgit v1.2.3 From 0a99be434d145079d0509473b19e840629d851c2 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Tue, 5 May 2020 15:01:20 +0200 Subject: net/smc: log important pnetid and state change events Print to system log when SMC links are available or go down, link group state changes or pnetids are applied to and removed from devices. The log entries are triggered by either user configuration actions or adapter activation/deactivation events and are not expected to happen often. The entries help SMC users to keep track of the SMC link group status and to detect when actions are needed (like to add replacements for failed adapters). Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 6 ++---- net/smc/smc_core.c | 34 +++++++++++++++++++++++++++++----- net/smc/smc_core.h | 2 +- net/smc/smc_ib.c | 11 +++++++++++ net/smc/smc_ism.c | 6 ++++++ net/smc/smc_llc.c | 25 +++++++++++++++++++------ net/smc/smc_llc.h | 2 +- net/smc/smc_pnet.c | 47 ++++++++++++++++++++++++++++++++++++++++++++--- 8 files changed, 113 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 4e4421c95ca1..903321543838 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -378,8 +378,6 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) struct smc_llc_qentry *qentry; int rc; - link->lgr->type = SMC_LGR_SINGLE; - /* receive CONFIRM LINK request from server over RoCE fabric */ qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, SMC_LLC_CONFIRM_LINK); @@ -414,6 +412,7 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) return SMC_CLC_DECL_TIMEOUT_CL; smc_llc_link_active(link); + smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); /* optional 2nd link, receive ADD LINK request from server */ qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, @@ -1037,8 +1036,6 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) struct smc_llc_qentry *qentry; int rc; - link->lgr->type = SMC_LGR_SINGLE; - if (smcr_link_reg_rmb(link, smc->conn.rmb_desc)) return SMC_CLC_DECL_ERR_REGRMB; @@ -1067,6 +1064,7 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) smc->conn.rmb_desc->is_conf_rkey = true; smc_llc_link_active(link); + smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); /* initial contact - try to establish second link */ smc_llc_srv_add_link(link); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index fb5f685ff494..65de700e1f17 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -369,7 +369,7 @@ dealloc_pd: free_link_mem: smc_wr_free_link_mem(lnk); clear_llc_lnk: - smc_llc_link_clear(lnk); + smc_llc_link_clear(lnk, false); out: put_device(&ini->ib_dev->ibdev->dev); memset(lnk, 0, sizeof(struct smc_link)); @@ -718,14 +718,14 @@ static void smcr_rtoken_clear_link(struct smc_link *lnk) } /* must be called under lgr->llc_conf_mutex lock */ -void smcr_link_clear(struct smc_link *lnk) +void smcr_link_clear(struct smc_link *lnk, bool log) { struct smc_ib_device *smcibdev; if (!lnk->lgr || lnk->state == SMC_LNK_UNUSED) return; lnk->peer_qpn = 0; - smc_llc_link_clear(lnk); + 
smc_llc_link_clear(lnk, log); smcr_buf_unmap_lgr(lnk); smcr_rtoken_clear_link(lnk); smc_ib_modify_qp_reset(lnk); @@ -812,7 +812,7 @@ static void smc_lgr_free(struct smc_link_group *lgr) mutex_lock(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (lgr->lnk[i].state != SMC_LNK_UNUSED) - smcr_link_clear(&lgr->lnk[i]); + smcr_link_clear(&lgr->lnk[i], false); } mutex_unlock(&lgr->llc_conf_mutex); smc_llc_lgr_clear(lgr); @@ -1040,12 +1040,36 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) /* set new lgr type and clear all asymmetric link tagging */ void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type) { + char *lgr_type = ""; int i; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) if (smc_link_usable(&lgr->lnk[i])) lgr->lnk[i].link_is_asym = false; + if (lgr->type == new_type) + return; lgr->type = new_type; + + switch (lgr->type) { + case SMC_LGR_NONE: + lgr_type = "NONE"; + break; + case SMC_LGR_SINGLE: + lgr_type = "SINGLE"; + break; + case SMC_LGR_SYMMETRIC: + lgr_type = "SYMMETRIC"; + break; + case SMC_LGR_ASYMMETRIC_PEER: + lgr_type = "ASYMMETRIC_PEER"; + break; + case SMC_LGR_ASYMMETRIC_LOCAL: + lgr_type = "ASYMMETRIC_LOCAL"; + break; + } + pr_warn_ratelimited("smc: SMC-R lg %*phN state changed: " + "%s, pnetid %.16s\n", SMC_LGR_ID_SIZE, &lgr->id, + lgr_type, lgr->pnet_id); } /* set new lgr type and tag a link as asymmetric */ @@ -1146,7 +1170,7 @@ static void smcr_link_down(struct smc_link *lnk) smc_ib_modify_qp_reset(lnk); to_lnk = smc_switch_conns(lgr, lnk, true); if (!to_lnk) { /* no backup link available */ - smcr_link_clear(lnk); + smcr_link_clear(lnk, true); return; } smcr_lgr_set_type(lgr, SMC_LGR_SINGLE); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 4ae76802214f..86d160f0d187 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -383,7 +383,7 @@ void smc_core_exit(void); int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, u8 link_idx, struct smc_init_info *ini); -void smcr_link_clear(struct smc_link *lnk); +void smcr_link_clear(struct smc_link *lnk, bool log); int smcr_buf_map_lgr(struct smc_link *lnk); int smcr_buf_reg_lgr(struct smc_link *lnk); void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type); diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 2c743caad69a..f0a5064bf9bd 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -575,6 +575,8 @@ static void smc_ib_add_dev(struct ib_device *ibdev) /* trigger reading of the port attributes */ port_cnt = smcibdev->ibdev->phys_port_cnt; + pr_warn_ratelimited("smc: adding ib device %s with port count %d\n", + smcibdev->ibdev->name, port_cnt); for (i = 0; i < min_t(size_t, port_cnt, SMC_MAX_PORTS); i++) { @@ -583,6 +585,13 @@ static void smc_ib_add_dev(struct ib_device *ibdev) if (smc_pnetid_by_dev_port(ibdev->dev.parent, i, smcibdev->pnetid[i])) smc_pnetid_by_table_ib(smcibdev, i + 1); + pr_warn_ratelimited("smc: ib device %s port %d has pnetid " + "%.16s%s\n", + smcibdev->ibdev->name, i + 1, + smcibdev->pnetid[i], + smcibdev->pnetid_by_user[i] ? 
+ " (user defined)" : + ""); } schedule_work(&smcibdev->port_event_work); } @@ -599,6 +608,8 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) spin_lock(&smc_ib_devices.lock); list_del_init(&smcibdev->list); /* remove from smc_ib_devices */ spin_unlock(&smc_ib_devices.lock); + pr_warn_ratelimited("smc: removing ib device %s\n", + smcibdev->ibdev->name); smc_smcr_terminate_all(smcibdev); smc_ib_cleanup_per_ibdev(smcibdev); ib_unregister_event_handler(&smcibdev->event_handler); diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 32be2da2cb85..91f85fc09fb8 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -321,12 +321,18 @@ int smcd_register_dev(struct smcd_dev *smcd) list_add_tail(&smcd->list, &smcd_dev_list.list); spin_unlock(&smcd_dev_list.lock); + pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", + dev_name(&smcd->dev), smcd->pnetid, + smcd->pnetid_by_user ? " (user defined)" : ""); + return device_add(&smcd->dev); } EXPORT_SYMBOL_GPL(smcd_register_dev); void smcd_unregister_dev(struct smcd_dev *smcd) { + pr_warn_ratelimited("smc: removing smcd device %s\n", + dev_name(&smcd->dev)); spin_lock(&smcd_dev_list.lock); list_del_init(&smcd->list); spin_unlock(&smcd_dev_list.lock); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 66ddc9cf5e2f..4cc583678ac7 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -870,7 +870,7 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) if (!rc) goto out; out_clear_lnk: - smcr_link_clear(lnk_new); + smcr_link_clear(lnk_new, false); out_reject: smc_llc_cli_add_link_reject(qentry); out: @@ -977,7 +977,7 @@ static void smc_llc_delete_asym_link(struct smc_link_group *lgr) } smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); out_free: - smcr_link_clear(lnk_asym); + smcr_link_clear(lnk_asym, true); } static int smc_llc_srv_rkey_exchange(struct smc_link *link, @@ -1121,7 +1121,7 @@ int smc_llc_srv_add_link(struct smc_link *link) goto out_err; return 0; out_err: - smcr_link_clear(link_new); + smcr_link_clear(link_new, false); return rc; } @@ -1227,7 +1227,7 @@ static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr) smc_switch_conns(lgr, lnk_del, false); smc_wr_tx_wait_no_pending_sends(lnk_del); } - smcr_link_clear(lnk_del); + smcr_link_clear(lnk_del, true); active_links = smc_llc_active_link_count(lgr); if (lnk_del == lnk_asym) { @@ -1320,7 +1320,7 @@ static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) } } } - smcr_link_clear(lnk_del); + smcr_link_clear(lnk_del, true); active_links = smc_llc_active_link_count(lgr); if (active_links == 1) { @@ -1711,6 +1711,12 @@ int smc_llc_link_init(struct smc_link *link) void smc_llc_link_active(struct smc_link *link) { + pr_warn_ratelimited("smc: SMC-R lg %*phN link added: id %*phN, " + "peerid %*phN, ibdev %s, ibport %d\n", + SMC_LGR_ID_SIZE, &link->lgr->id, + SMC_LGR_ID_SIZE, &link->link_uid, + SMC_LGR_ID_SIZE, &link->peer_link_uid, + link->smcibdev->ibdev->name, link->ibport); link->state = SMC_LNK_ACTIVE; if (link->lgr->llc_testlink_time) { link->llc_testlink_time = link->lgr->llc_testlink_time * HZ; @@ -1720,8 +1726,15 @@ void smc_llc_link_active(struct smc_link *link) } /* called in worker context */ -void smc_llc_link_clear(struct smc_link *link) +void smc_llc_link_clear(struct smc_link *link, bool log) { + if (log) + pr_warn_ratelimited("smc: SMC-R lg %*phN link removed: id %*phN" + ", peerid %*phN, ibdev %s, ibport %d\n", + SMC_LGR_ID_SIZE, &link->lgr->id, + SMC_LGR_ID_SIZE, 
&link->link_uid, + SMC_LGR_ID_SIZE, &link->peer_link_uid, + link->smcibdev->ibdev->name, link->ibport); complete(&link->llc_testlink_resp); cancel_delayed_work_sync(&link->llc_testlink_wrk); smc_wr_wakeup_reg_wait(link); diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 55287376112d..a5d2fe3eea61 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -82,7 +82,7 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc); void smc_llc_lgr_clear(struct smc_link_group *lgr); int smc_llc_link_init(struct smc_link *link); void smc_llc_link_active(struct smc_link *link); -void smc_llc_link_clear(struct smc_link *link); +void smc_llc_link_clear(struct smc_link *link, bool log); int smc_llc_do_confirm_rkey(struct smc_link *send_link, struct smc_buf_desc *rmb_desc); int smc_llc_do_delete_rkey(struct smc_link_group *lgr, diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 50c96e843fab..be03f1260d59 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -110,8 +110,14 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) if (!pnet_name || smc_pnet_match(pnetelem->pnet_name, pnet_name)) { list_del(&pnetelem->list); - if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev) + if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev) { dev_put(pnetelem->ndev); + pr_warn_ratelimited("smc: net device %s " + "erased user defined " + "pnetid %.16s\n", + pnetelem->eth_name, + pnetelem->pnet_name); + } kfree(pnetelem); rc = 0; } @@ -130,6 +136,12 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) (!pnet_name || smc_pnet_match(pnet_name, ibdev->pnetid[ibport]))) { + pr_warn_ratelimited("smc: ib device %s ibport " + "%d erased user defined " + "pnetid %.16s\n", + ibdev->ibdev->name, + ibport + 1, + ibdev->pnetid[ibport]); memset(ibdev->pnetid[ibport], 0, SMC_MAX_PNETID_LEN); ibdev->pnetid_by_user[ibport] = false; @@ -144,6 +156,10 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) if (smcd_dev->pnetid_by_user && (!pnet_name || smc_pnet_match(pnet_name, smcd_dev->pnetid))) { + pr_warn_ratelimited("smc: smcd device %s " + "erased user defined pnetid " + "%.16s\n", dev_name(&smcd_dev->dev), + smcd_dev->pnetid); memset(smcd_dev->pnetid, 0, SMC_MAX_PNETID_LEN); smcd_dev->pnetid_by_user = false; rc = 0; @@ -174,6 +190,10 @@ static int smc_pnet_add_by_ndev(struct net_device *ndev) dev_hold(ndev); pnetelem->ndev = ndev; rc = 0; + pr_warn_ratelimited("smc: adding net device %s with " + "user defined pnetid %.16s\n", + pnetelem->eth_name, + pnetelem->pnet_name); break; } } @@ -201,6 +221,10 @@ static int smc_pnet_remove_by_ndev(struct net_device *ndev) dev_put(pnetelem->ndev); pnetelem->ndev = NULL; rc = 0; + pr_warn_ratelimited("smc: removing net device %s with " + "user defined pnetid %.16s\n", + pnetelem->eth_name, + pnetelem->pnet_name); break; } } @@ -357,6 +381,10 @@ static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, kfree(new_pe); goto out_put; } + if (ndev) + pr_warn_ratelimited("smc: net device %s " + "applied user defined pnetid %.16s\n", + new_pe->eth_name, new_pe->pnet_name); return 0; out_put: @@ -377,11 +405,24 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, /* try to apply the pnetid to active devices */ ib_dev = smc_pnet_find_ib(ib_name); - if (ib_dev) + if (ib_dev) { ibdev_applied = smc_pnet_apply_ib(ib_dev, ib_port, pnet_name); + if (ibdev_applied) + pr_warn_ratelimited("smc: ib device %s ibport %d " + "applied user defined pnetid " + "%.16s\n", 
ib_dev->ibdev->name, + ib_port, + ib_dev->pnetid[ib_port - 1]); + } smcd_dev = smc_pnet_find_smcd(ib_name); - if (smcd_dev) + if (smcd_dev) { smcddev_applied = smc_pnet_apply_smcd(smcd_dev, pnet_name); + if (smcddev_applied) + pr_warn_ratelimited("smc: smcd device %s " + "applied user defined pnetid " + "%.16s\n", dev_name(&smcd_dev->dev), + smcd_dev->pnetid); + } /* Apply fails when a device has a hardware-defined pnetid set, do not * add a pnet table entry in that case. */ -- cgit v1.2.3 From fea805237dd984a71a2c5e5cf074a15505d5ba31 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 5 May 2020 15:01:21 +0200 Subject: net/smc: remove unused inline function smc_curs_read This function was left behind by commit bac6de7b6370 ("net/smc: eliminate cursor read and write calls"). Signed-off-by: YueHaibing Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/smc_cdc.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'net') diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 9cfabc9af120..2ddcc5fb5ceb 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -97,23 +97,6 @@ static inline void smc_curs_add(int size, union smc_host_cursor *curs, } } -/* SMC cursors are 8 bytes long and require atomic reading and writing */ -static inline u64 smc_curs_read(union smc_host_cursor *curs, - struct smc_connection *conn) -{ -#ifndef KERNEL_HAS_ATOMIC64 - unsigned long flags; - u64 ret; - - spin_lock_irqsave(&conn->acurs_lock, flags); - ret = curs->acurs; - spin_unlock_irqrestore(&conn->acurs_lock, flags); - return ret; -#else - return atomic64_read(&curs->acurs); -#endif -} - /* Copy cursor src into tgt */ static inline void smc_curs_copy(union smc_host_cursor *tgt, union smc_host_cursor *src, -- cgit v1.2.3 From f989d546a2d5a9f001f6f8be49d98c10ab9b1897 Mon Sep 17 00:00:00 2001 From: William Tu Date: Tue, 5 May 2020 09:05:06 -0700 Subject: erspan: Add type I version 0 support. The Type I ERSPAN frame format is based on the barebones IP + GRE (4-byte) encapsulation on top of the raw mirrored frame. Both type I and II use 0x88BE as the protocol type. Unlike type II and III, no sequence number or key is required. To create a type I erspan tunnel device: $ ip link add dev erspan11 type erspan \ local 172.16.1.100 remote 172.16.1.200 \ erspan_ver 0 Signed-off-by: William Tu Signed-off-by: David S. Miller --- include/net/erspan.h | 19 +++++++++++++++-- net/ipv4/ip_gre.c | 58 ++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 60 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/net/erspan.h b/include/net/erspan.h index b39643ef4c95..0d9e86bd9893 100644 --- a/include/net/erspan.h +++ b/include/net/erspan.h @@ -2,7 +2,19 @@ #define __LINUX_ERSPAN_H /* - * GRE header for ERSPAN encapsulation (8 octets [34:41]) -- 8 bytes + * GRE header for ERSPAN type I encapsulation (4 octets [34:37]) + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |0|0|0|0|0|00000|000000000|00000| Protocol Type for ERSPAN | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * The Type I ERSPAN frame format is based on the barebones IP + GRE + * encapsulation (as described above) on top of the raw mirrored frame. + * There is no extra ERSPAN header.
+ * + * + * GRE header for ERSPAN type II and III encapsulation (8 octets [34:41]) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ @@ -43,7 +55,7 @@ * | Platform Specific Info | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * - * GRE proto ERSPAN type II = 0x88BE, type III = 0x22EB + * GRE proto ERSPAN type I/II = 0x88BE, type III = 0x22EB */ #include @@ -139,6 +151,9 @@ static inline u8 get_hwid(const struct erspan_md2 *md2) static inline int erspan_hdr_len(int version) { + if (version == 0) + return 0; + return sizeof(struct erspan_base_hdr) + (version == 1 ? ERSPAN_V1_MDSIZE : ERSPAN_V2_MDSIZE); } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 029b24eeafba..e29cd48674d7 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -248,6 +248,15 @@ static void gre_err(struct sk_buff *skb, u32 info) ipgre_err(skb, info, &tpi); } +static bool is_erspan_type1(int gre_hdr_len) +{ + /* Both ERSPAN type I (version 0) and type II (version 1) use + * protocol 0x88BE, but the type I has only 4-byte GRE header, + * while type II has 8-byte. + */ + return gre_hdr_len == 4; +} + static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, int gre_hdr_len) { @@ -262,17 +271,26 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, int len; itn = net_generic(net, erspan_net_id); - iph = ip_hdr(skb); - ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); - ver = ershdr->ver; - - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, - tpi->flags | TUNNEL_KEY, - iph->saddr, iph->daddr, tpi->key); + if (is_erspan_type1(gre_hdr_len)) { + ver = 0; + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, + tpi->flags | TUNNEL_NO_KEY, + iph->saddr, iph->daddr, 0); + } else { + ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); + ver = ershdr->ver; + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, + tpi->flags | TUNNEL_KEY, + iph->saddr, iph->daddr, tpi->key); + } if (tunnel) { + if (is_erspan_type1(gre_hdr_len)) + len = gre_hdr_len; + else + len = gre_hdr_len + erspan_hdr_len(ver); + if (unlikely(!pskb_may_pull(skb, len))) return PACKET_REJECT; @@ -665,7 +683,10 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, } /* Push ERSPAN header */ - if (tunnel->erspan_ver == 1) { + if (tunnel->erspan_ver == 0) { + proto = htons(ETH_P_ERSPAN); + tunnel->parms.o_flags &= ~TUNNEL_SEQ; + } else if (tunnel->erspan_ver == 1) { erspan_build_header(skb, ntohl(tunnel->parms.o_key), tunnel->index, truncate, true); @@ -1066,7 +1087,10 @@ static int erspan_validate(struct nlattr *tb[], struct nlattr *data[], if (ret) return ret; - /* ERSPAN should only have GRE sequence and key flag */ + if (nla_get_u8(data[IFLA_GRE_ERSPAN_VER]) == 0) + return 0; + + /* ERSPAN type II/III should only have GRE sequence and key flag */ if (data[IFLA_GRE_OFLAGS]) flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); if (data[IFLA_GRE_IFLAGS]) @@ -1174,7 +1198,7 @@ static int erspan_netlink_parms(struct net_device *dev, if (data[IFLA_GRE_ERSPAN_VER]) { t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); - if (t->erspan_ver != 1 && t->erspan_ver != 2) + if (t->erspan_ver > 2) return -EINVAL; } @@ -1259,7 +1283,11 @@ static int erspan_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - tunnel->tun_hlen = 8; + if (tunnel->erspan_ver == 0) + tunnel->tun_hlen = 4; /* 4-byte GRE hdr.
*/ + else + tunnel->tun_hlen = 8; /* 8-byte GRE hdr. */ + tunnel->parms.iph.protocol = IPPROTO_GRE; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + erspan_hdr_len(tunnel->erspan_ver); @@ -1456,8 +1484,8 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) struct ip_tunnel_parm *p = &t->parms; __be16 o_flags = p->o_flags; - if (t->erspan_ver == 1 || t->erspan_ver == 2) { - if (!t->collect_md) + if (t->erspan_ver <= 2) { + if (t->erspan_ver != 0 && !t->collect_md) o_flags |= TUNNEL_KEY; if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver)) @@ -1466,7 +1494,7 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) if (t->erspan_ver == 1) { if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index)) goto nla_put_failure; - } else { + } else if (t->erspan_ver == 2) { if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid)) -- cgit v1.2.3 From 6d64be3da282908bb17b0803b9edad8852ffea56 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 May 2020 10:06:03 +0200 Subject: xfrm: avoid extract_output indirection for ipv4 We can use a direct call for ipv4, so move the needed functions to net/xfrm/xfrm_output.c and call them directly. For ipv6 the indirection can be avoided as well but it will need a bit more work -- to ease review it will be done in another patch. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - net/ipv4/xfrm4_output.c | 40 ---------------------------------------- net/ipv4/xfrm4_state.c | 1 - net/xfrm/xfrm_output.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 46 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 2577666c34c8..397007324abd 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1580,7 +1580,6 @@ static inline int xfrm4_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) return xfrm_input(skb, nexthdr, spi, 0); } -int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb); int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb); int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb); int xfrm4_protocol_register(struct xfrm4_protocol *handler, unsigned char protocol); diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 89ba7c87de5d..21c8fa0a31ed 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -14,46 +14,6 @@ #include #include -static int xfrm4_tunnel_check_size(struct sk_buff *skb) -{ - int mtu, ret = 0; - - if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) - goto out; - - if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->ignore_df) - goto out; - - mtu = dst_mtu(skb_dst(skb)); - if ((!skb_is_gso(skb) && skb->len > mtu) || - (skb_is_gso(skb) && - !skb_gso_validate_network_len(skb, ip_skb_dst_mtu(skb->sk, skb)))) { - skb->protocol = htons(ETH_P_IP); - - if (skb->sk) - xfrm_local_error(skb, mtu); - else - icmp_send(skb, ICMP_DEST_UNREACH, - ICMP_FRAG_NEEDED, htonl(mtu)); - ret = -EMSGSIZE; - } -out: - return ret; -} - -int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb) -{ - int err; - - err = xfrm4_tunnel_check_size(skb); - if (err) - return err; - - XFRM_MODE_SKB_CB(skb)->protocol = ip_hdr(skb)->protocol; - - return xfrm4_extract_header(skb); -} - int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb) { memset(IPCB(skb), 0, sizeof(*IPCB(skb))); diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 
f8ed3c3bb928..d7c200779e4f 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -37,7 +37,6 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = { .output = xfrm4_output, .output_finish = xfrm4_output_finish, .extract_input = xfrm4_extract_input, - .extract_output = xfrm4_extract_output, .transport_finish = xfrm4_transport_finish, .local_error = xfrm4_local_error, }; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 2fd3d990d992..a7b3af7f7a1e 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -609,6 +610,47 @@ out: } EXPORT_SYMBOL_GPL(xfrm_output); +static int xfrm4_tunnel_check_size(struct sk_buff *skb) +{ + int mtu, ret = 0; + + if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) + goto out; + + if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->ignore_df) + goto out; + + mtu = dst_mtu(skb_dst(skb)); + if ((!skb_is_gso(skb) && skb->len > mtu) || + (skb_is_gso(skb) && + !skb_gso_validate_network_len(skb, ip_skb_dst_mtu(skb->sk, skb)))) { + skb->protocol = htons(ETH_P_IP); + + if (skb->sk) + xfrm_local_error(skb, mtu); + else + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_FRAG_NEEDED, htonl(mtu)); + ret = -EMSGSIZE; + } +out: + return ret; +} + +static int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + + err = xfrm4_tunnel_check_size(skb); + if (err) + return err; + + XFRM_MODE_SKB_CB(skb)->protocol = ip_hdr(skb)->protocol; + + xfrm4_extract_header(skb); + return 0; +} + static int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb) { const struct xfrm_state_afinfo *afinfo; @@ -624,6 +666,10 @@ static int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb) if (inner_mode == NULL) return -EAFNOSUPPORT; + switch (inner_mode->family) { + case AF_INET: + return xfrm4_extract_output(x, skb); + } rcu_read_lock(); afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family); if (likely(afinfo)) -- cgit v1.2.3 From a269fbfc4e9ffe48c1f8142e60a49b6f2e588c58 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 May 2020 10:06:04 +0200 Subject: xfrm: state: remove extract_input indirection from xfrm_state_afinfo In order to keep CONFIG_IPV6=m working, xfrm6_extract_header needs to be duplicated. It will be removed again in a followup change when the remaining caller is moved to net/xfrm as well. 
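A minimal, self-contained C sketch of the transformation applied throughout this series; the names are stand-ins rather than kernel definitions, but the shape matches: an indirect call through a per-family ops table becomes a direct switch on the address family, which drops an indirect branch and lets the compiler inline the helpers:

#include <stdio.h>

enum { MY_AF_INET = 2, MY_AF_INET6 = 10 };      /* stand-in constants */

static void extract_header_v4(void) { printf("v4 header\n"); }
static void extract_header_v6(void) { printf("v6 header\n"); }

/* Before: the caller looked up an afinfo ops table under RCU and went
 * through a function pointer. After: a direct, inlinable switch. */
static int extract_input(int family)
{
        switch (family) {
        case MY_AF_INET:
                extract_header_v4();
                return 0;
        case MY_AF_INET6:
                extract_header_v6();
                return 0;
        default:
                return -1;      /* plays the role of -EAFNOSUPPORT */
        }
}

int main(void)
{
        return extract_input(MY_AF_INET) || extract_input(MY_AF_INET6);
}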
Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 --- net/ipv4/xfrm4_input.c | 5 ----- net/ipv4/xfrm4_state.c | 1 - net/ipv6/xfrm6_input.c | 5 ----- net/ipv6/xfrm6_output.c | 17 ++++++++++++++++- net/ipv6/xfrm6_state.c | 24 ------------------------ net/xfrm/xfrm_inout.h | 18 ++++++++++++++++++ net/xfrm/xfrm_input.c | 21 +++++++++++---------- 8 files changed, 45 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 397007324abd..a21c1dea5340 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -362,8 +362,6 @@ struct xfrm_state_afinfo { int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb); int (*output_finish)(struct sock *sk, struct sk_buff *skb); - int (*extract_input)(struct xfrm_state *x, - struct sk_buff *skb); int (*extract_output)(struct xfrm_state *x, struct sk_buff *skb); int (*transport_finish)(struct sk_buff *skb, @@ -1587,7 +1585,6 @@ int xfrm4_protocol_deregister(struct xfrm4_protocol *handler, unsigned char prot int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family); int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family); void xfrm4_local_error(struct sk_buff *skb, u32 mtu); -int xfrm6_extract_header(struct sk_buff *skb); int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb); int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, struct ip6_tnl *t); diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index f8de2482a529..ad2afeef4f10 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -18,11 +18,6 @@ #include #include -int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb) -{ - return xfrm4_extract_header(skb); -} - static int xfrm4_rcv_encap_finish2(struct net *net, struct sock *sk, struct sk_buff *skb) { diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index d7c200779e4f..521fc1bc069c 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -36,7 +36,6 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = { .proto = IPPROTO_IPIP, .output = xfrm4_output, .output_finish = xfrm4_output_finish, - .extract_input = xfrm4_extract_input, .transport_finish = xfrm4_transport_finish, .local_error = xfrm4_local_error, }; diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 56f52353b324..04cbeefd8982 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -17,11 +17,6 @@ #include #include -int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb) -{ - return xfrm6_extract_header(skb); -} - int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, struct ip6_tnl *t) { diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index fbe51d40bd7e..855078a43fc7 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -94,6 +94,20 @@ out: return ret; } +static void __xfrm6_extract_header(struct sk_buff *skb) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + + XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); + XFRM_MODE_SKB_CB(skb)->id = 0; + XFRM_MODE_SKB_CB(skb)->frag_off = htons(IP_DF); + XFRM_MODE_SKB_CB(skb)->tos = ipv6_get_dsfield(iph); + XFRM_MODE_SKB_CB(skb)->ttl = iph->hop_limit; + XFRM_MODE_SKB_CB(skb)->optlen = 0; + memcpy(XFRM_MODE_SKB_CB(skb)->flow_lbl, iph->flow_lbl, + sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); +} + int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb) { int err; @@ -104,7 +118,8 @@ int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb) 
XFRM_MODE_SKB_CB(skb)->protocol = ipv6_hdr(skb)->nexthdr; - return xfrm6_extract_header(skb); + __xfrm6_extract_header(skb); + return 0; } int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb) diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index 78daadecbdef..8fbf5a68ee6e 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -13,36 +13,12 @@ */ #include -#include -#include -#include -#include -#include -#include -#include - -int xfrm6_extract_header(struct sk_buff *skb) -{ - struct ipv6hdr *iph = ipv6_hdr(skb); - - XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); - XFRM_MODE_SKB_CB(skb)->id = 0; - XFRM_MODE_SKB_CB(skb)->frag_off = htons(IP_DF); - XFRM_MODE_SKB_CB(skb)->tos = ipv6_get_dsfield(iph); - XFRM_MODE_SKB_CB(skb)->ttl = iph->hop_limit; - XFRM_MODE_SKB_CB(skb)->optlen = 0; - memcpy(XFRM_MODE_SKB_CB(skb)->flow_lbl, iph->flow_lbl, - sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); - - return 0; -} static struct xfrm_state_afinfo xfrm6_state_afinfo = { .family = AF_INET6, .proto = IPPROTO_IPV6, .output = xfrm6_output, .output_finish = xfrm6_output_finish, - .extract_input = xfrm6_extract_input, .extract_output = xfrm6_extract_output, .transport_finish = xfrm6_transport_finish, .local_error = xfrm6_local_error, diff --git a/net/xfrm/xfrm_inout.h b/net/xfrm/xfrm_inout.h index c7b0318938e2..e24abac92dc2 100644 --- a/net/xfrm/xfrm_inout.h +++ b/net/xfrm/xfrm_inout.h @@ -6,6 +6,24 @@ #ifndef XFRM_INOUT_H #define XFRM_INOUT_H 1 +static inline void xfrm6_extract_header(struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct ipv6hdr *iph = ipv6_hdr(skb); + + XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); + XFRM_MODE_SKB_CB(skb)->id = 0; + XFRM_MODE_SKB_CB(skb)->frag_off = htons(IP_DF); + XFRM_MODE_SKB_CB(skb)->tos = ipv6_get_dsfield(iph); + XFRM_MODE_SKB_CB(skb)->ttl = iph->hop_limit; + XFRM_MODE_SKB_CB(skb)->optlen = 0; + memcpy(XFRM_MODE_SKB_CB(skb)->flow_lbl, iph->flow_lbl, + sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); +#else + WARN_ON_ONCE(1); +#endif +} + static inline void xfrm6_beet_make_header(struct sk_buff *skb) { struct ipv6hdr *iph = ipv6_hdr(skb); diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index aa35f23c4912..6db266a0cb2d 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -353,17 +353,18 @@ xfrm_inner_mode_encap_remove(struct xfrm_state *x, static int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb) { const struct xfrm_mode *inner_mode = &x->inner_mode; - const struct xfrm_state_afinfo *afinfo; - int err = -EAFNOSUPPORT; - - rcu_read_lock(); - afinfo = xfrm_state_afinfo_get_rcu(x->outer_mode.family); - if (likely(afinfo)) - err = afinfo->extract_input(x, skb); - rcu_read_unlock(); - if (err) - return err; + switch (x->outer_mode.family) { + case AF_INET: + xfrm4_extract_header(skb); + break; + case AF_INET6: + xfrm6_extract_header(skb); + break; + default: + WARN_ON_ONCE(1); + return -EAFNOSUPPORT; + } if (x->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); -- cgit v1.2.3 From 171916cbd53dec5c7b05efb56a201671d92effc1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 May 2020 10:06:05 +0200 Subject: xfrm: move xfrm4_extract_header to common helper The function only initializes the XFRM CB in the skb. After previous patch xfrm4_extract_header is only called from net/xfrm/xfrm_{input,output}.c. 
Because of IPV6=m linker errors, the ipv6 equivalent (xfrm6_extract_header) was already placed in xfrm_inout.h, since we cannot call functions residing in a module from the core. So do the same for the ipv4 helper and place it next to the ipv6 one. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - net/ipv4/xfrm4_state.c | 21 --------------------- net/xfrm/xfrm_inout.h | 14 ++++++++++++++ 3 files changed, 14 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index a21c1dea5340..8b956528b6e6 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1562,7 +1562,6 @@ int pktgen_xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb); #endif void xfrm_local_error(struct sk_buff *skb, int mtu); -int xfrm4_extract_header(struct sk_buff *skb); int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb); int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 521fc1bc069c..b23a1711297b 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -8,28 +8,7 @@ * */ -#include #include -#include -#include -#include -#include - -int xfrm4_extract_header(struct sk_buff *skb) -{ - const struct iphdr *iph = ip_hdr(skb); - - XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); - XFRM_MODE_SKB_CB(skb)->id = iph->id; - XFRM_MODE_SKB_CB(skb)->frag_off = iph->frag_off; - XFRM_MODE_SKB_CB(skb)->tos = iph->tos; - XFRM_MODE_SKB_CB(skb)->ttl = iph->ttl; - XFRM_MODE_SKB_CB(skb)->optlen = iph->ihl * 4 - sizeof(*iph); - memset(XFRM_MODE_SKB_CB(skb)->flow_lbl, 0, - sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); - - return 0; -} static struct xfrm_state_afinfo xfrm4_state_afinfo = { .family = AF_INET, diff --git a/net/xfrm/xfrm_inout.h b/net/xfrm/xfrm_inout.h index e24abac92dc2..efc5e6b2e87b 100644 --- a/net/xfrm/xfrm_inout.h +++ b/net/xfrm/xfrm_inout.h @@ -6,6 +6,20 @@ #ifndef XFRM_INOUT_H #define XFRM_INOUT_H 1 +static inline void xfrm4_extract_header(struct sk_buff *skb) +{ + const struct iphdr *iph = ip_hdr(skb); + + XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); + XFRM_MODE_SKB_CB(skb)->id = iph->id; + XFRM_MODE_SKB_CB(skb)->frag_off = iph->frag_off; + XFRM_MODE_SKB_CB(skb)->tos = iph->tos; + XFRM_MODE_SKB_CB(skb)->ttl = iph->ttl; + XFRM_MODE_SKB_CB(skb)->optlen = iph->ihl * 4 - sizeof(*iph); + memset(XFRM_MODE_SKB_CB(skb)->flow_lbl, 0, + sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); +} + static inline void xfrm6_extract_header(struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IPV6) -- cgit v1.2.3 From 3e50ddd8b8d5067796fc87cbbb25c71451ccb385 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 May 2020 10:06:06 +0200 Subject: xfrm: expose local_rxpmtu via ipv6_stubs We cannot call this function from the core kernel unless we force CONFIG_IPV6=y. Therefore expose this via ipv6_stubs so we can call it from net/xfrm in the followup patch. Since the call is expected to be unlikely, no extra code for the IPV6=y case is added and we will always eat the indirection cost.
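For readers unfamiliar with the stub mechanism, a hedged, self-contained C sketch with hypothetical names: the core holds only a pointer to an ops table, the module fills it in at load time, so the core never references module symbols directly, and every call pays the small indirection cost mentioned above:

#include <stdio.h>

struct ipv6_stub_ops {
        void (*local_rxpmtu)(unsigned int mtu);
};

/* Core side: just a pointer, no link-time reference to module code. */
static const struct ipv6_stub_ops *ipv6_stub;

/* "Module" side: the real implementation, registered when it loads. */
static void real_local_rxpmtu(unsigned int mtu)
{
        printf("rxpmtu: %u\n", mtu);
}

static const struct ipv6_stub_ops impl = {
        .local_rxpmtu = real_local_rxpmtu,
};

static void module_load(void)
{
        ipv6_stub = &impl;
}

int main(void)
{
        module_load();
        if (ipv6_stub && ipv6_stub->local_rxpmtu)
                ipv6_stub->local_rxpmtu(1280);  /* always an indirect call */
        return 0;
}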
Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/ipv6_stubs.h | 1 + include/net/xfrm.h | 1 + net/ipv6/af_inet6.c | 1 + net/ipv6/xfrm6_output.c | 2 +- 4 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index 1e9e0cf7dc75..d8ab3872aa2a 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -57,6 +57,7 @@ struct ipv6_stub { const struct in6_addr *solicited_addr, bool router, bool solicited, bool override, bool inc_opt); #if IS_ENABLED(CONFIG_XFRM) + void (*xfrm6_local_rxpmtu)(struct sk_buff *skb, u32 mtu); int (*xfrm6_udp_encap_rcv)(struct sock *sk, struct sk_buff *skb); int (*xfrm6_rcv_encap)(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 8b956528b6e6..10295ab4cdfb 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1608,6 +1608,7 @@ int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, u8 **prevhdr); #ifdef CONFIG_XFRM +void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu); int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb); int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb); int xfrm_user_policy(struct sock *sk, int optname, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index cbbb00bad20e..aa4882929fd0 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -963,6 +963,7 @@ static const struct ipv6_stub ipv6_stub_impl = { .udpv6_encap_enable = udpv6_encap_enable, .ndisc_send_na = ndisc_send_na, #if IS_ENABLED(CONFIG_XFRM) + .xfrm6_local_rxpmtu = xfrm6_local_rxpmtu, .xfrm6_udp_encap_rcv = xfrm6_udp_encap_rcv, .xfrm6_rcv_encap = xfrm6_rcv_encap, #endif diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 855078a43fc7..23e2b52cfba6 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -40,7 +40,7 @@ static int xfrm6_local_dontfrag(struct sk_buff *skb) return 0; } -static void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu) +void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu) { struct flowi6 fl6; struct sock *sk = skb->sk; -- cgit v1.2.3 From ede64dd2bfe2710549f1922a214959d966baaac3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 May 2020 10:06:07 +0200 Subject: xfrm: place xfrm6_local_dontfrag in xfrm.h so next patch can re-use it from net/xfrm/xfrm_output.c without causing a linker error when IPV6 is a module. 
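A hedged sketch of why moving the helper into a header works: a static inline is compiled into every translation unit that uses it, so nothing has to be exported from the ipv6 module and no unresolved symbol can appear. Simplified userspace C with stand-in constants and types:

#include <stdbool.h>
#include <stdio.h>

#define MY_AF_INET6    10       /* stand-ins for the kernel constants */
#define MY_IPPROTO_UDP 17
#define MY_IPPROTO_RAW 255

struct sock {
        int sk_family;
        int sk_protocol;
        bool dontfrag;
};

/* Lives in a shared header: every caller gets its own inlined copy,
 * so the helper never becomes an undefined symbol when ipv6 is modular. */
static inline bool my_local_dontfrag(const struct sock *sk)
{
        if (!sk || sk->sk_family != MY_AF_INET6)
                return false;
        if (sk->sk_protocol == MY_IPPROTO_UDP ||
            sk->sk_protocol == MY_IPPROTO_RAW)
                return sk->dontfrag;
        return false;
}

int main(void)
{
        struct sock sk = { MY_AF_INET6, MY_IPPROTO_UDP, true };

        printf("%d\n", my_local_dontfrag(&sk));        /* prints 1 */
        return 0;
}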
Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 16 ++++++++++++++++ net/ipv6/xfrm6_output.c | 21 ++------------------- 2 files changed, 18 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 10295ab4cdfb..8f7fb033d557 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1993,4 +1993,20 @@ static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x, return 0; } + +#if IS_ENABLED(CONFIG_IPV6) +static inline bool xfrm6_local_dontfrag(const struct sock *sk) +{ + int proto; + + if (!sk || sk->sk_family != AF_INET6) + return false; + + proto = sk->sk_protocol; + if (proto == IPPROTO_UDP || proto == IPPROTO_RAW) + return inet6_sk(sk)->dontfrag; + + return false; +} +#endif #endif /* _NET_XFRM_H */ diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 23e2b52cfba6..be64f280510c 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -23,23 +23,6 @@ int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, } EXPORT_SYMBOL(xfrm6_find_1stfragopt); -static int xfrm6_local_dontfrag(struct sk_buff *skb) -{ - int proto; - struct sock *sk = skb->sk; - - if (sk) { - if (sk->sk_family != AF_INET6) - return 0; - - proto = sk->sk_protocol; - if (proto == IPPROTO_UDP || proto == IPPROTO_RAW) - return inet6_sk(sk)->dontfrag; - } - - return 0; -} - void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu) { struct flowi6 fl6; @@ -82,7 +65,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) skb->dev = dst->dev; skb->protocol = htons(ETH_P_IPV6); - if (xfrm6_local_dontfrag(skb)) + if (xfrm6_local_dontfrag(skb->sk)) xfrm6_local_rxpmtu(skb, mtu); else if (skb->sk) xfrm_local_error(skb, mtu); @@ -181,7 +164,7 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) toobig = skb->len > mtu && !skb_is_gso(skb); - if (toobig && xfrm6_local_dontfrag(skb)) { + if (toobig && xfrm6_local_dontfrag(skb->sk)) { xfrm6_local_rxpmtu(skb, mtu); kfree_skb(skb); return -EMSGSIZE; -- cgit v1.2.3 From f3075f48ddb2c4d076aeda36fa0939163e4b2816 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 May 2020 10:06:08 +0200 Subject: xfrm: remove extract_output indirection from xfrm_state_afinfo Move this to xfrm_output.c. This avoids the state->extract_output indirection. This patch also removes the duplicated __xfrm6_extract_header helper added in an earlier patch, we can now use the one from xfrm_inout.h . 
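A small hedged sketch of the compile-time guard pattern used for the ipv6 helper, with a stand-in macro playing the role of IS_ENABLED(CONFIG_IPV6): when the ipv6 side is configured out, the function collapses to a warning plus an error return instead of an unresolved symbol:

#include <stdio.h>

#define MY_CONFIG_IPV6 1        /* flip to 0 to exercise the fallback */

static int extract_output_v6(void)
{
#if MY_CONFIG_IPV6
        printf("v6 output header extracted\n");
        return 0;
#else
        /* stands in for WARN_ON_ONCE(1) and -EAFNOSUPPORT */
        fprintf(stderr, "warn: ipv6 not built in\n");
        return -97;
#endif
}

int main(void)
{
        return extract_output_v6();
}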
Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 --- net/ipv6/xfrm6_output.c | 58 ------------------------------------------ net/ipv6/xfrm6_state.c | 1 - net/xfrm/xfrm_output.c | 67 +++++++++++++++++++++++++++++++++++++++++++------ 4 files changed, 59 insertions(+), 70 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 8f7fb033d557..db814a7e042f 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -362,8 +362,6 @@ struct xfrm_state_afinfo { int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb); int (*output_finish)(struct sock *sk, struct sk_buff *skb); - int (*extract_output)(struct xfrm_state *x, - struct sk_buff *skb); int (*transport_finish)(struct sk_buff *skb, int async); void (*local_error)(struct sk_buff *skb, u32 mtu); @@ -1601,7 +1599,6 @@ int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family); int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family); __be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr); __be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr); -int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb); int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb); int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb); int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index be64f280510c..b7d65b344679 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -47,64 +47,6 @@ void xfrm6_local_error(struct sk_buff *skb, u32 mtu) ipv6_local_error(sk, EMSGSIZE, &fl6, mtu); } -static int xfrm6_tunnel_check_size(struct sk_buff *skb) -{ - int mtu, ret = 0; - struct dst_entry *dst = skb_dst(skb); - - if (skb->ignore_df) - goto out; - - mtu = dst_mtu(dst); - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; - - if ((!skb_is_gso(skb) && skb->len > mtu) || - (skb_is_gso(skb) && - !skb_gso_validate_network_len(skb, ip6_skb_dst_mtu(skb)))) { - skb->dev = dst->dev; - skb->protocol = htons(ETH_P_IPV6); - - if (xfrm6_local_dontfrag(skb->sk)) - xfrm6_local_rxpmtu(skb, mtu); - else if (skb->sk) - xfrm_local_error(skb, mtu); - else - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - ret = -EMSGSIZE; - } -out: - return ret; -} - -static void __xfrm6_extract_header(struct sk_buff *skb) -{ - struct ipv6hdr *iph = ipv6_hdr(skb); - - XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); - XFRM_MODE_SKB_CB(skb)->id = 0; - XFRM_MODE_SKB_CB(skb)->frag_off = htons(IP_DF); - XFRM_MODE_SKB_CB(skb)->tos = ipv6_get_dsfield(iph); - XFRM_MODE_SKB_CB(skb)->ttl = iph->hop_limit; - XFRM_MODE_SKB_CB(skb)->optlen = 0; - memcpy(XFRM_MODE_SKB_CB(skb)->flow_lbl, iph->flow_lbl, - sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); -} - -int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb) -{ - int err; - - err = xfrm6_tunnel_check_size(skb); - if (err) - return err; - - XFRM_MODE_SKB_CB(skb)->protocol = ipv6_hdr(skb)->nexthdr; - - __xfrm6_extract_header(skb); - return 0; -} - int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb) { memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index 8fbf5a68ee6e..15247f2f78e1 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -19,7 +19,6 @@ static struct xfrm_state_afinfo xfrm6_state_afinfo = { .proto = IPPROTO_IPV6, .output = xfrm6_output, .output_finish = xfrm6_output_finish, - .extract_output = 
xfrm6_extract_output, .transport_finish = xfrm6_transport_finish, .local_error = xfrm6_local_error, }; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index a7b3af7f7a1e..3a646df1318d 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -17,6 +17,11 @@ #include #include +#if IS_ENABLED(CONFIG_IPV6) +#include +#include +#endif + #include "xfrm_inout.h" static int xfrm_output2(struct net *net, struct sock *sk, struct sk_buff *skb); @@ -651,11 +656,60 @@ static int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb) return 0; } +#if IS_ENABLED(CONFIG_IPV6) +static int xfrm6_tunnel_check_size(struct sk_buff *skb) +{ + int mtu, ret = 0; + struct dst_entry *dst = skb_dst(skb); + + if (skb->ignore_df) + goto out; + + mtu = dst_mtu(dst); + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + + if ((!skb_is_gso(skb) && skb->len > mtu) || + (skb_is_gso(skb) && + !skb_gso_validate_network_len(skb, ip6_skb_dst_mtu(skb)))) { + skb->dev = dst->dev; + skb->protocol = htons(ETH_P_IPV6); + + if (xfrm6_local_dontfrag(skb->sk)) + ipv6_stub->xfrm6_local_rxpmtu(skb, mtu); + else if (skb->sk) + xfrm_local_error(skb, mtu); + else + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + ret = -EMSGSIZE; + } +out: + return ret; +} +#endif + +static int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_IPV6) + int err; + + err = xfrm6_tunnel_check_size(skb); + if (err) + return err; + + XFRM_MODE_SKB_CB(skb)->protocol = ipv6_hdr(skb)->nexthdr; + + xfrm6_extract_header(skb); + return 0; +#else + WARN_ON_ONCE(1); + return -EAFNOSUPPORT; +#endif +} + static int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb) { - const struct xfrm_state_afinfo *afinfo; const struct xfrm_mode *inner_mode; - int err = -EAFNOSUPPORT; if (x->sel.family == AF_UNSPEC) inner_mode = xfrm_ip2inner_mode(x, @@ -669,14 +723,11 @@ static int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb) switch (inner_mode->family) { case AF_INET: return xfrm4_extract_output(x, skb); + case AF_INET6: + return xfrm6_extract_output(x, skb); } - rcu_read_lock(); - afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family); - if (likely(afinfo)) - err = afinfo->extract_output(x, skb); - rcu_read_unlock(); - return err; + return -EAFNOSUPPORT; } void xfrm_local_error(struct sk_buff *skb, int mtu) -- cgit v1.2.3 From 2ab6096db2f16b3a6adbad252f1be171e649028d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 May 2020 10:06:09 +0200 Subject: xfrm: remove output_finish indirection from xfrm_state_afinfo There are only two implementations, one for ipv4 and one for ipv6. Both are almost identical: they clear skb->cb[], set the TRANSFORMED flag in IP(6)CB and then call the common xfrm_output() function. By placing the IPCB handling into the common function, we avoid the need for the output_finish indirection as the output functions can simply use xfrm_output().
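A hedged userspace C sketch of the consolidation, using hypothetical types: the two wrappers differed only in which per-family control block view of skb->cb[] they reset, so the common output path can do the reset itself behind a family switch and the wrappers disappear:

#include <stdio.h>
#include <string.h>

enum { MY_AF_INET = 2, MY_AF_INET6 = 10 };

struct my_skb {
        int family;
        unsigned char cb[48];   /* stand-in for skb->cb[] */
};

/* Formerly duplicated in per-family output_finish wrappers; now done
 * once in the shared output function. */
static int common_output(struct my_skb *skb)
{
        switch (skb->family) {
        case MY_AF_INET:
                memset(skb->cb, 0, sizeof(skb->cb));    /* IPCB reset */
                break;
        case MY_AF_INET6:
                memset(skb->cb, 0, sizeof(skb->cb));    /* IP6CB reset */
                break;
        }
        printf("transform and transmit (family %d)\n", skb->family);
        return 0;
}

int main(void)
{
        struct my_skb skb = { MY_AF_INET, { 0 } };

        return common_output(&skb);
}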
Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - net/ipv4/xfrm4_output.c | 23 +---------------------- net/ipv4/xfrm4_state.c | 1 - net/ipv6/xfrm6_output.c | 34 ++-------------------------------- net/ipv6/xfrm6_state.c | 1 - net/xfrm/xfrm_output.c | 16 ++++++++++++++++ 6 files changed, 19 insertions(+), 57 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index db814a7e042f..094fe682f5d7 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -361,7 +361,6 @@ struct xfrm_state_afinfo { const struct xfrm_type *type_dstopts; int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb); - int (*output_finish)(struct sock *sk, struct sk_buff *skb); int (*transport_finish)(struct sk_buff *skb, int async); void (*local_error)(struct sk_buff *skb, u32 mtu); diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 21c8fa0a31ed..502eb189d852 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -14,22 +14,9 @@ #include #include -int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb) -{ - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - -#ifdef CONFIG_NETFILTER - IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; -#endif - - return xfrm_output(sk, skb); -} - static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct xfrm_state *x = skb_dst(skb)->xfrm; - const struct xfrm_state_afinfo *afinfo; - int ret = -EAFNOSUPPORT; #ifdef CONFIG_NETFILTER if (!x) { @@ -38,15 +25,7 @@ static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) } #endif - rcu_read_lock(); - afinfo = xfrm_state_afinfo_get_rcu(x->outer_mode.family); - if (likely(afinfo)) - ret = afinfo->output_finish(sk, skb); - else - kfree_skb(skb); - rcu_read_unlock(); - - return ret; + return xfrm_output(sk, skb); } int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index b23a1711297b..87d4db591488 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -14,7 +14,6 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = { .family = AF_INET, .proto = IPPROTO_IPIP, .output = xfrm4_output, - .output_finish = xfrm4_output_finish, .transport_finish = xfrm4_transport_finish, .local_error = xfrm4_local_error, }; diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index b7d65b344679..8b84d534b19d 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -47,39 +47,9 @@ void xfrm6_local_error(struct sk_buff *skb, u32 mtu) ipv6_local_error(sk, EMSGSIZE, &fl6, mtu); } -int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb) -{ - memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); - -#ifdef CONFIG_NETFILTER - IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; -#endif - - return xfrm_output(sk, skb); -} - -static int __xfrm6_output_state_finish(struct xfrm_state *x, struct sock *sk, - struct sk_buff *skb) -{ - const struct xfrm_state_afinfo *afinfo; - int ret = -EAFNOSUPPORT; - - rcu_read_lock(); - afinfo = xfrm_state_afinfo_get_rcu(x->outer_mode.family); - if (likely(afinfo)) - ret = afinfo->output_finish(sk, skb); - else - kfree_skb(skb); - rcu_read_unlock(); - - return ret; -} - static int __xfrm6_output_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct xfrm_state *x = skb_dst(skb)->xfrm; - - return __xfrm6_output_state_finish(x, sk, skb); + return xfrm_output(sk, skb); } static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) @@ -121,7 
+91,7 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) __xfrm6_output_finish); skip_frag: - return __xfrm6_output_state_finish(x, sk, skb); + return xfrm_output(sk, skb); } int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index 15247f2f78e1..6610b2198fa9 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -18,7 +18,6 @@ static struct xfrm_state_afinfo xfrm6_state_afinfo = { .family = AF_INET6, .proto = IPPROTO_IPV6, .output = xfrm6_output, - .output_finish = xfrm6_output_finish, .transport_finish = xfrm6_transport_finish, .local_error = xfrm6_local_error, }; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 3a646df1318d..9c43b8dd80fb 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -571,6 +571,22 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) struct xfrm_state *x = skb_dst(skb)->xfrm; int err; + switch (x->outer_mode.family) { + case AF_INET: + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); +#ifdef CONFIG_NETFILTER + IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; +#endif + break; + case AF_INET6: + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + +#ifdef CONFIG_NETFILTER + IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; +#endif + break; + } + secpath_reset(skb); if (xfrm_dev_offload_ok(skb, x)) { -- cgit v1.2.3 From 8741e18419bf4b9e3b02c0ed01ea13f1aa497fa7 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Wed, 6 May 2020 14:16:16 +0800 Subject: net: bridge: return false in br_mrp_enabled() Fix the following coccicheck warning: net/bridge/br_private.h:1334:8-9: WARNING: return of 0/1 in function 'br_mrp_enabled' with return type bool Fixes: 6536993371fab ("bridge: mrp: Integrate MRP into the bridge") Signed-off-by: Jason Yan Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_private.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index c35647cb138a..78d3a951180d 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1331,7 +1331,7 @@ static inline int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb) static inline bool br_mrp_enabled(struct net_bridge *br) { - return 0; + return false; } static inline void br_mrp_port_del(struct net_bridge *br, -- cgit v1.2.3 From 969c54646af0d7d94a5f0f37adbbfe024e85466e Mon Sep 17 00:00:00 2001 From: Fernando Gont Date: Fri, 1 May 2020 00:51:47 -0300 Subject: ipv6: Implement draft-ietf-6man-rfc4941bis Implement the upcoming rev of RFC4941 (IPv6 temporary addresses): https://tools.ietf.org/html/draft-ietf-6man-rfc4941bis-09 * Reduces the default Valid Lifetime to 2 days The number of extra addresses employed when Valid Lifetime was 7 days exacerbated the stress caused on network elements/devices. Additionally, the motivation for temporary addresses is indeed privacy and reduced exposure. With a default Valid Lifetime of 7 days, an address that becomes revealed by active communication is reachable and exposed for one whole week. The only use case for a Valid Lifetime of 7 days could be some application that is expecting to have long-lived connections. But if you want to have long-lived connections, you shouldn't be using a temporary address in the first place. Additionally, in the era of mobile devices, general applications should nevertheless be prepared and robust to address changes (e.g. nodes swap wifi <-> 4G, etc.)
* Employs different IIDs for different prefixes To avoid network activity correlation among addresses configured for different prefixes * Uses a simpler algorithm for IID generation No need to store "history" anywhere Signed-off-by: Fernando Gont Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.rst | 2 +- include/net/if_inet6.h | 1 - net/ipv6/addrconf.c | 91 +++++++++++++++------------------- 3 files changed, 40 insertions(+), 54 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 50b440d29a13..b72f89d5694c 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -2065,7 +2065,7 @@ use_tempaddr - INTEGER temp_valid_lft - INTEGER valid lifetime (in seconds) for temporary addresses. - Default: 604800 (7 days) + Default: 172800 (2 days) temp_prefered_lft - INTEGER Preferred lifetime (in seconds) for temporary addresses. diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index a01981d7108f..212eb278bda6 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -190,7 +190,6 @@ struct inet6_dev { int dead; u32 desync_factor; - u8 rndid[8]; struct list_head tempaddr_list; struct in6_addr token; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 26e666fe9a0e..fd885f06c4ed 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -135,8 +135,7 @@ static inline void addrconf_sysctl_unregister(struct inet6_dev *idev) } #endif -static void ipv6_regen_rndid(struct inet6_dev *idev); -static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr); +static void ipv6_gen_rnd_iid(struct in6_addr *addr); static int ipv6_generate_eui64(u8 *eui, struct net_device *dev); static int ipv6_count_addresses(const struct inet6_dev *idev); @@ -432,8 +431,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) dev->type == ARPHRD_SIT || dev->type == ARPHRD_NONE) { ndev->cnf.use_tempaddr = -1; - } else - ipv6_regen_rndid(ndev); + } ndev->token = in6addr_any; @@ -1306,29 +1304,21 @@ out: in6_ifa_put(ifp); } -static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, - struct inet6_ifaddr *ift, - bool block) +static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, bool block) { struct inet6_dev *idev = ifp->idev; - struct in6_addr addr, *tmpaddr; unsigned long tmp_tstamp, age; unsigned long regen_advance; - struct ifa6_config cfg; - int ret = 0; unsigned long now = jiffies; - long max_desync_factor; s32 cnf_temp_preferred_lft; + struct inet6_ifaddr *ift; + struct ifa6_config cfg; + long max_desync_factor; + struct in6_addr addr; + int ret = 0; write_lock_bh(&idev->lock); - if (ift) { - spin_lock_bh(&ift->lock); - memcpy(&addr.s6_addr[8], &ift->addr.s6_addr[8], 8); - spin_unlock_bh(&ift->lock); - tmpaddr = &addr; - } else { - tmpaddr = NULL; - } + retry: in6_dev_hold(idev); if (idev->cnf.use_tempaddr <= 0) { @@ -1351,8 +1341,8 @@ retry: } in6_ifa_hold(ifp); memcpy(addr.s6_addr, ifp->addr.s6_addr, 8); - ipv6_try_regen_rndid(idev, tmpaddr); - memcpy(&addr.s6_addr[8], idev->rndid, 8); + ipv6_gen_rnd_iid(&addr); + age = (now - ifp->tstamp) / HZ; regen_advance = idev->cnf.regen_max_retry * @@ -1417,7 +1407,6 @@ retry: in6_ifa_put(ifp); in6_dev_put(idev); pr_info("%s: retry temporary address regeneration\n", __func__); - tmpaddr = &addr; write_lock_bh(&idev->lock); goto retry; } @@ -2032,7 +2021,7 @@ static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed) if (ifpub) { in6_ifa_hold(ifpub); 
spin_unlock_bh(&ifp->lock); - ipv6_create_tempaddr(ifpub, ifp, true); + ipv6_create_tempaddr(ifpub, true); in6_ifa_put(ifpub); } else { spin_unlock_bh(&ifp->lock); } @@ -2329,40 +2318,38 @@ static int ipv6_inherit_eui64(u8 *eui, struct inet6_dev *idev) return err; } -/* (re)generation of randomized interface identifier (RFC 3041 3.2, 3.5) */ -static void ipv6_regen_rndid(struct inet6_dev *idev) +/* Generation of a randomized Interface Identifier + * draft-ietf-6man-rfc4941bis, Section 3.3.1 + */ + +static void ipv6_gen_rnd_iid(struct in6_addr *addr) { regen: - get_random_bytes(idev->rndid, sizeof(idev->rndid)); - idev->rndid[0] &= ~0x02; + get_random_bytes(&addr->s6_addr[8], 8); - /* - * : - * check if generated address is not inappropriate + /* , Section 3.3.1: + * check if generated address is not inappropriate: * - * - Reserved subnet anycast (RFC 2526) - * 11111101 11....11 1xxxxxxx - * - ISATAP (RFC4214) 6.1 - * 00-00-5E-FE-xx-xx-xx-xx - * - value 0 - * - XXX: already assigned to an address on the device + * - Reserved IPv6 Interface Identifers + * - XXX: already assigned to an address on the device */ - if (idev->rndid[0] == 0xfd && - (idev->rndid[1]&idev->rndid[2]&idev->rndid[3]&idev->rndid[4]&idev->rndid[5]&idev->rndid[6]) == 0xff && - (idev->rndid[7]&0x80)) + + /* Subnet-router anycast: 0000:0000:0000:0000 */ + if (!(addr->s6_addr32[2] | addr->s6_addr32[3])) goto regen; - if ((idev->rndid[0]|idev->rndid[1]) == 0) { - if (idev->rndid[2] == 0x5e && idev->rndid[3] == 0xfe) - goto regen; - if ((idev->rndid[2]|idev->rndid[3]|idev->rndid[4]|idev->rndid[5]|idev->rndid[6]|idev->rndid[7]) == 0x00) - goto regen; - } -} -static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr) -{ - if (tmpaddr && memcmp(idev->rndid, &tmpaddr->s6_addr[8], 8) == 0) - ipv6_regen_rndid(idev); + /* IANA Ethernet block: 0200:5EFF:FE00:0000-0200:5EFF:FE00:5212 + * Proxy Mobile IPv6: 0200:5EFF:FE00:5213 + * IANA Ethernet block: 0200:5EFF:FE00:5214-0200:5EFF:FEFF:FFFF + */ + if (ntohl(addr->s6_addr32[2]) == 0x02005eff && + (ntohl(addr->s6_addr32[3]) & 0Xff000000) == 0xfe000000) + goto regen; + + /* Reserved subnet anycast addresses */ + if (ntohl(addr->s6_addr32[2]) == 0xfdffffff && + ntohl(addr->s6_addr32[3]) >= 0Xffffff80) + goto regen; } /* @@ -2544,7 +2531,7 @@ static void manage_tempaddrs(struct inet6_dev *idev, * no temporary address currently exists. */ read_unlock_bh(&idev->lock); - ipv6_create_tempaddr(ifp, NULL, false); + ipv6_create_tempaddr(ifp, false); } else { read_unlock_bh(&idev->lock); } @@ -4531,7 +4518,7 @@ restart: ifpub->regen_count = 0; spin_unlock(&ifpub->lock); rcu_read_unlock_bh(); - ipv6_create_tempaddr(ifpub, ifp, true); + ipv6_create_tempaddr(ifpub, true); in6_ifa_put(ifpub); in6_ifa_put(ifp); rcu_read_lock_bh(); -- cgit v1.2.3 From 8dc242ad661c2694a582541c2264ffc0e7c4d27d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 May 2020 11:27:49 -0700 Subject: tcp: refine tcp_pacing_delay() for very low pacing rates With the addition of the horizon feature to sch_fq, we noticed some suboptimal behavior of extremely low pacing rate TCP flows, especially when TCP is not aware of a drop happening in lower stacks. Back in commit 3f80e08f40cd ("tcp: add tcp_reset_xmit_timer() helper"), tcp_pacing_delay() was added to estimate an extra delay to add to standard rto timers.
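In condensed form, the estimate changes from a per-skb view to a flow-level view (a sketch; the exact hunks are below):

	/* Before: delay based on a given skb's EDT, if one was passed */
	s64 pacing_delay = skb ? skb->tstamp : tcp_sk(sk)->tcp_wstamp_ns;
	pacing_delay -= tcp_sk(sk)->tcp_clock_cache;

	/* After: always based on when the next packet may be sent */
	s64 delay = tcp_sk(sk)->tcp_wstamp_ns - tcp_sk(sk)->tcp_clock_cache;
	return delay > 0 ? nsecs_to_jiffies(delay) : 0;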
This patch removes the skb argument from this helper and tcp_reset_xmit_timer() because it makes more sense to simply consider the time at which next packet is allowed to be sent, instead of the time of whatever packet has been sent. This avoids arming RTO timer too soon and removes spurious horizon drops. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 21 ++++++++------------- net/ipv4/tcp_input.c | 4 ++-- net/ipv4/tcp_output.c | 8 +++----- 3 files changed, 13 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 1beed50522b1..43b87a8d4790 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1289,26 +1289,22 @@ static inline bool tcp_needs_internal_pacing(const struct sock *sk) return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED; } -/* Return in jiffies the delay before one skb is sent. - * If @skb is NULL, we look at EDT for next packet being sent on the socket. +/* Estimates in how many jiffies next packet for this flow can be sent. + * Scheduling a retransmit timer too early would be silly. */ -static inline unsigned long tcp_pacing_delay(const struct sock *sk, - const struct sk_buff *skb) +static inline unsigned long tcp_pacing_delay(const struct sock *sk) { - s64 pacing_delay = skb ? skb->tstamp : tcp_sk(sk)->tcp_wstamp_ns; + s64 delay = tcp_sk(sk)->tcp_wstamp_ns - tcp_sk(sk)->tcp_clock_cache; - pacing_delay -= tcp_sk(sk)->tcp_clock_cache; - - return pacing_delay > 0 ? nsecs_to_jiffies(pacing_delay) : 0; + return delay > 0 ? nsecs_to_jiffies(delay) : 0; } static inline void tcp_reset_xmit_timer(struct sock *sk, const int what, unsigned long when, - const unsigned long max_when, - const struct sk_buff *skb) + const unsigned long max_when) { - inet_csk_reset_xmit_timer(sk, what, when + tcp_pacing_delay(sk, skb), + inet_csk_reset_xmit_timer(sk, what, when + tcp_pacing_delay(sk), max_when); } @@ -1336,8 +1332,7 @@ static inline void tcp_check_probe_timer(struct sock *sk) { if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending) tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - tcp_probe0_base(sk), TCP_RTO_MAX, - NULL); + tcp_probe0_base(sk), TCP_RTO_MAX); } static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d68128a672ab..7d205b2a733c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3014,7 +3014,7 @@ void tcp_rearm_rto(struct sock *sk) rto = usecs_to_jiffies(max_t(int, delta_us, 1)); } tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, - TCP_RTO_MAX, tcp_rtx_queue_head(sk)); + TCP_RTO_MAX); } } @@ -3291,7 +3291,7 @@ static void tcp_ack_probe(struct sock *sk) unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX); tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - when, TCP_RTO_MAX, NULL); + when, TCP_RTO_MAX); } } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index c414aeb1efa9..32c9db902f18 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2593,8 +2593,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) if (rto_delta_us > 0) timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us)); - tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, - TCP_RTO_MAX, NULL); + tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX); return true; } @@ -3174,8 +3173,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, - TCP_RTO_MAX, - skb); + 
TCP_RTO_MAX); } } @@ -3907,7 +3905,7 @@ void tcp_send_probe0(struct sock *sk) */ timeout = TCP_RESOURCE_PROBE_INTERVAL; } - tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX, NULL); + tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX); } int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) -- cgit v1.2.3 From 916e6d1a5ef17a6b3bffad0f086f173cde4240d8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 May 2020 11:27:50 -0700 Subject: tcp: defer xmit timer reset in tcp_xmit_retransmit_queue() As hinted in the prior change ("tcp: refine tcp_pacing_delay() for very low pacing rates"), it is probably best to arm the xmit timer only when all the packets have been scheduled, rather than when the head of the rtx queue has been re-sent. This does matter for flows having extremely low pacing rates, since their tp->tcp_wstamp_ns could be far in the future. Note that the regular xmit path has a stronger limit in tcp_small_queue_check(), meaning it is less likely to go beyond the pacing horizon. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 32c9db902f18..a50e1990a845 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3112,6 +3112,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb, *rtx_head, *hole = NULL; struct tcp_sock *tp = tcp_sk(sk); + bool rearm_timer = false; u32 max_segs; int mib_idx; @@ -3134,7 +3135,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) segs = tp->snd_cwnd - tcp_packets_in_flight(tp); if (segs <= 0) - return; + break; sacked = TCP_SKB_CB(skb)->sacked; /* In case tcp_shift_skb_data() have aggregated large skbs, * we need to make sure not sending too bigs TSO packets @@ -3159,10 +3160,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk) continue; if (tcp_small_queue_check(sk, skb, 1)) - return; + break; if (tcp_retransmit_skb(sk, skb, segs)) - return; + break; NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb)); @@ -3171,10 +3172,13 @@ if (skb == rtx_head && icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) - tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, - TCP_RTO_MAX); + rearm_timer = true; + } + if (rearm_timer) + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, + TCP_RTO_MAX); } /* We allow to exceed memory limits for FIN packets to expedite -- cgit v1.2.3 From bdbdac7649fac05f88c9f7ab18121a17fb591687 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Tue, 5 May 2020 08:35:05 +0200 Subject: ethtool: provide UAPI for PHY master/slave configuration. This UAPI is needed for BroadR-Reach 100BASE-T1 devices. Due to lack of auto-negotiation support, we needed to be able to configure the MASTER-SLAVE role of the port manually or from an application in user space. The same UAPI can be used for 1000BASE-T or MultiGBASE-T devices to force MASTER or SLAVE role.
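In the generic PHY driver, the new UAPI values map onto the MASTER-SLAVE control bits of register 9 (MII_CTRL1000); a condensed sketch of genphy_setup_master_slave() from the diff below:

	u16 ctl = 0;

	switch (phydev->master_slave_set) {
	case MASTER_SLAVE_CFG_MASTER_PREFERRED:
		ctl |= CTL1000_PREFER_MASTER;	/* prefer to operate as master */
		break;
	case MASTER_SLAVE_CFG_MASTER_FORCE:
		ctl |= CTL1000_AS_MASTER;	/* manual master */
		/* fallthrough */
	case MASTER_SLAVE_CFG_SLAVE_FORCE:
		ctl |= CTL1000_ENABLE_MASTER;	/* manual configuration enable */
		break;
	}

	phy_modify_changed(phydev, MII_CTRL1000,
			   CTL1000_ENABLE_MASTER | CTL1000_AS_MASTER |
			   CTL1000_PREFER_MASTER, ctl);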
See IEEE 802.3-2018: 22.2.4.3.7 MASTER-SLAVE control register (Register 9) 22.2.4.3.8 MASTER-SLAVE status register (Register 10) 40.5.2 MASTER-SLAVE configuration resolution 45.2.1.185.1 MASTER-SLAVE config value (1.2100.14) 45.2.7.10 MultiGBASE-T AN control 1 register (Register 7.32) The MASTER-SLAVE role affects the clock configuration: ------------------------------------------------------------------------------- When the PHY is configured as MASTER, the PMA Transmit function shall source TX_TCLK from a local clock source. When configured as SLAVE, the PMA Transmit function shall source TX_TCLK from the clock recovered from data stream provided by MASTER. iMX6Q KSZ9031 XXX ------\ /-----------\ /------------\ | | | | | MAC |<----RGMII----->| PHY Slave |<------>| PHY Master | |<--- 125 MHz ---+-<------/ | | \ | ------/ \-----------/ \------------/ ^ \-TX_TCLK ------------------------------------------------------------------------------- Since some clock or link related issues are only reproducible in a specific MASTER-SLAVE-role, MAC and PHY configuration, it is beneficial to provide generic (not 100BASE-T1 specific) interface to the user space for configuration flexibility and trouble shooting. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 35 ++++++----- drivers/net/phy/phy.c | 4 +- drivers/net/phy/phy_device.c | 94 ++++++++++++++++++++++++++++ include/linux/phy.h | 3 + include/uapi/linux/ethtool.h | 16 ++++- include/uapi/linux/ethtool_netlink.h | 2 + include/uapi/linux/mii.h | 2 + net/ethtool/ioctl.c | 6 ++ net/ethtool/linkmodes.c | 53 ++++++++++++++++ 9 files changed, 197 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 567326491f80..8f5cefc539cf 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -392,14 +392,16 @@ Request contents: Kernel response contents: - ==================================== ====== ========================== - ``ETHTOOL_A_LINKMODES_HEADER`` nested reply header - ``ETHTOOL_A_LINKMODES_AUTONEG`` u8 autonegotiation status - ``ETHTOOL_A_LINKMODES_OURS`` bitset advertised link modes - ``ETHTOOL_A_LINKMODES_PEER`` bitset partner link modes - ``ETHTOOL_A_LINKMODES_SPEED`` u32 link speed (Mb/s) - ``ETHTOOL_A_LINKMODES_DUPLEX`` u8 duplex mode - ==================================== ====== ========================== + ========================================== ====== ========================== + ``ETHTOOL_A_LINKMODES_HEADER`` nested reply header + ``ETHTOOL_A_LINKMODES_AUTONEG`` u8 autonegotiation status + ``ETHTOOL_A_LINKMODES_OURS`` bitset advertised link modes + ``ETHTOOL_A_LINKMODES_PEER`` bitset partner link modes + ``ETHTOOL_A_LINKMODES_SPEED`` u32 link speed (Mb/s) + ``ETHTOOL_A_LINKMODES_DUPLEX`` u8 duplex mode + ``ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG`` u8 Master/slave port mode + ``ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE`` u8 Master/slave port state + ========================================== ====== ========================== For ``ETHTOOL_A_LINKMODES_OURS``, value represents advertised modes and mask represents supported modes. 
``ETHTOOL_A_LINKMODES_PEER`` in the reply is a bit @@ -414,14 +416,15 @@ LINKMODES_SET Request contents: - ==================================== ====== ========================== - ``ETHTOOL_A_LINKMODES_HEADER`` nested request header - ``ETHTOOL_A_LINKMODES_AUTONEG`` u8 autonegotiation status - ``ETHTOOL_A_LINKMODES_OURS`` bitset advertised link modes - ``ETHTOOL_A_LINKMODES_PEER`` bitset partner link modes - ``ETHTOOL_A_LINKMODES_SPEED`` u32 link speed (Mb/s) - ``ETHTOOL_A_LINKMODES_DUPLEX`` u8 duplex mode - ==================================== ====== ========================== + ========================================== ====== ========================== + ``ETHTOOL_A_LINKMODES_HEADER`` nested request header + ``ETHTOOL_A_LINKMODES_AUTONEG`` u8 autonegotiation status + ``ETHTOOL_A_LINKMODES_OURS`` bitset advertised link modes + ``ETHTOOL_A_LINKMODES_PEER`` bitset partner link modes + ``ETHTOOL_A_LINKMODES_SPEED`` u32 link speed (Mb/s) + ``ETHTOOL_A_LINKMODES_DUPLEX`` u8 duplex mode + ``ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG`` u8 Master/slave port mode + ========================================== ====== ========================== ``ETHTOOL_A_LINKMODES_OURS`` bit set allows setting advertised link modes. If autonegotiation is on (either set now or kept from before), advertised modes diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 72c69a9c8a98..8c22d02b4218 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -295,7 +295,7 @@ int phy_ethtool_ksettings_set(struct phy_device *phydev, phydev->advertising, autoneg == AUTONEG_ENABLE); phydev->duplex = duplex; - + phydev->master_slave_set = cmd->base.master_slave_cfg; phydev->mdix_ctrl = cmd->base.eth_tp_mdix_ctrl; /* Restart the PHY */ @@ -314,6 +314,8 @@ void phy_ethtool_ksettings_get(struct phy_device *phydev, cmd->base.speed = phydev->speed; cmd->base.duplex = phydev->duplex; + cmd->base.master_slave_cfg = phydev->master_slave_get; + cmd->base.master_slave_state = phydev->master_slave_state; if (phydev->interface == PHY_INTERFACE_MODE_MOCA) cmd->base.port = PORT_BNC; else diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index b1c5e4503bc4..83fc8e1b5793 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1913,6 +1913,90 @@ int genphy_setup_forced(struct phy_device *phydev) } EXPORT_SYMBOL(genphy_setup_forced); +static int genphy_setup_master_slave(struct phy_device *phydev) +{ + u16 ctl = 0; + + if (!phydev->is_gigabit_capable) + return 0; + + switch (phydev->master_slave_set) { + case MASTER_SLAVE_CFG_MASTER_PREFERRED: + ctl |= CTL1000_PREFER_MASTER; + break; + case MASTER_SLAVE_CFG_SLAVE_PREFERRED: + break; + case MASTER_SLAVE_CFG_MASTER_FORCE: + ctl |= CTL1000_AS_MASTER; + /* fallthrough */ + case MASTER_SLAVE_CFG_SLAVE_FORCE: + ctl |= CTL1000_ENABLE_MASTER; + break; + case MASTER_SLAVE_CFG_UNKNOWN: + case MASTER_SLAVE_CFG_UNSUPPORTED: + return 0; + default: + phydev_warn(phydev, "Unsupported Master/Slave mode\n"); + return -EOPNOTSUPP; + } + + return phy_modify_changed(phydev, MII_CTRL1000, + (CTL1000_ENABLE_MASTER | CTL1000_AS_MASTER | + CTL1000_PREFER_MASTER), ctl); +} + +static int genphy_read_master_slave(struct phy_device *phydev) +{ + int cfg, state; + u16 val; + + if (!phydev->is_gigabit_capable) { + phydev->master_slave_get = MASTER_SLAVE_CFG_UNSUPPORTED; + phydev->master_slave_state = MASTER_SLAVE_STATE_UNSUPPORTED; + return 0; + } + + phydev->master_slave_get = MASTER_SLAVE_CFG_UNKNOWN; + phydev->master_slave_state = MASTER_SLAVE_STATE_UNKNOWN; + + 
val = phy_read(phydev, MII_CTRL1000); + if (val < 0) + return val; + + if (val & CTL1000_ENABLE_MASTER) { + if (val & CTL1000_AS_MASTER) + cfg = MASTER_SLAVE_CFG_MASTER_FORCE; + else + cfg = MASTER_SLAVE_CFG_SLAVE_FORCE; + } else { + if (val & CTL1000_PREFER_MASTER) + cfg = MASTER_SLAVE_CFG_MASTER_PREFERRED; + else + cfg = MASTER_SLAVE_CFG_SLAVE_PREFERRED; + } + + val = phy_read(phydev, MII_STAT1000); + if (val < 0) + return val; + + if (val & LPA_1000MSFAIL) { + state = MASTER_SLAVE_STATE_ERR; + } else if (phydev->link) { + /* this bits are valid only for active link */ + if (val & LPA_1000MSRES) + state = MASTER_SLAVE_STATE_MASTER; + else + state = MASTER_SLAVE_STATE_SLAVE; + } else { + state = MASTER_SLAVE_STATE_UNKNOWN; + } + + phydev->master_slave_get = cfg; + phydev->master_slave_state = state; + + return 0; +} + /** * genphy_restart_aneg - Enable and Restart Autonegotiation * @phydev: target phy_device struct @@ -1971,6 +2055,12 @@ int __genphy_config_aneg(struct phy_device *phydev, bool changed) if (genphy_config_eee_advert(phydev)) changed = true; + err = genphy_setup_master_slave(phydev); + if (err < 0) + return err; + else if (err) + changed = true; + if (AUTONEG_ENABLE != phydev->autoneg) return genphy_setup_forced(phydev); @@ -2205,6 +2295,10 @@ int genphy_read_status(struct phy_device *phydev) phydev->pause = 0; phydev->asym_pause = 0; + err = genphy_read_master_slave(phydev); + if (err < 0) + return err; + err = genphy_read_lpa(phydev); if (err < 0) return err; diff --git a/include/linux/phy.h b/include/linux/phy.h index 1d36ac608159..a2b91b5f9d0a 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -477,6 +477,9 @@ struct phy_device { int duplex; int pause; int asym_pause; + u8 master_slave_get; + u8 master_slave_set; + u8 master_slave_state; /* Union of PHY and Attached devices' supported link modes */ /* See ethtool.h for more info */ diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 92f737f10117..f4662b3a9e1e 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1666,6 +1666,18 @@ static inline int ethtool_validate_duplex(__u8 duplex) return 0; } +#define MASTER_SLAVE_CFG_UNSUPPORTED 0 +#define MASTER_SLAVE_CFG_UNKNOWN 1 +#define MASTER_SLAVE_CFG_MASTER_PREFERRED 2 +#define MASTER_SLAVE_CFG_SLAVE_PREFERRED 3 +#define MASTER_SLAVE_CFG_MASTER_FORCE 4 +#define MASTER_SLAVE_CFG_SLAVE_FORCE 5 +#define MASTER_SLAVE_STATE_UNSUPPORTED 0 +#define MASTER_SLAVE_STATE_UNKNOWN 1 +#define MASTER_SLAVE_STATE_MASTER 2 +#define MASTER_SLAVE_STATE_SLAVE 3 +#define MASTER_SLAVE_STATE_ERR 4 + /* Which connector port. 
*/ #define PORT_TP 0x00 #define PORT_AUI 0x01 @@ -1904,7 +1916,9 @@ struct ethtool_link_settings { __u8 eth_tp_mdix_ctrl; __s8 link_mode_masks_nwords; __u8 transceiver; - __u8 reserved1[3]; + __u8 master_slave_cfg; + __u8 master_slave_state; + __u8 reserved1[1]; __u32 reserved[7]; __u32 link_mode_masks[0]; /* layout of link_mode_masks fields: diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 7fde76366ba4..bf1d310e20bc 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -216,6 +216,8 @@ enum { ETHTOOL_A_LINKMODES_PEER, /* bitset */ ETHTOOL_A_LINKMODES_SPEED, /* u32 */ ETHTOOL_A_LINKMODES_DUPLEX, /* u8 */ + ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG, /* u8 */ + ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE, /* u8 */ /* add new constants above here */ __ETHTOOL_A_LINKMODES_CNT, diff --git a/include/uapi/linux/mii.h b/include/uapi/linux/mii.h index 90f9b4e1ba27..39f7c44baf53 100644 --- a/include/uapi/linux/mii.h +++ b/include/uapi/linux/mii.h @@ -151,11 +151,13 @@ /* 1000BASE-T Control register */ #define ADVERTISE_1000FULL 0x0200 /* Advertise 1000BASE-T full duplex */ #define ADVERTISE_1000HALF 0x0100 /* Advertise 1000BASE-T half duplex */ +#define CTL1000_PREFER_MASTER 0x0400 /* prefer to operate as master */ #define CTL1000_AS_MASTER 0x0800 #define CTL1000_ENABLE_MASTER 0x1000 /* 1000BASE-T Status register */ #define LPA_1000MSFAIL 0x8000 /* Master/Slave resolution failure */ +#define LPA_1000MSRES 0x4000 /* Master/Slave resolution status */ #define LPA_1000LOCALRXOK 0x2000 /* Link partner local receiver status */ #define LPA_1000REMRXOK 0x1000 /* Link partner remote receiver status */ #define LPA_1000FULL 0x0800 /* Link partner 1000BASE-T full duplex */ diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 226d5ecdd567..52102ab1709b 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -552,6 +552,8 @@ static int ethtool_get_link_ksettings(struct net_device *dev, link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS; link_ksettings.base.link_mode_masks_nwords = __ETHTOOL_LINK_MODE_MASK_NU32; + link_ksettings.base.master_slave_cfg = MASTER_SLAVE_CFG_UNSUPPORTED; + link_ksettings.base.master_slave_state = MASTER_SLAVE_STATE_UNSUPPORTED; return store_link_ksettings_for_user(useraddr, &link_ksettings); } @@ -589,6 +591,10 @@ static int ethtool_set_link_ksettings(struct net_device *dev, != link_ksettings.base.link_mode_masks_nwords) return -EINVAL; + if (link_ksettings.base.master_slave_cfg || + link_ksettings.base.master_slave_state) + return -EINVAL; + err = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); if (err >= 0) { ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL); diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index 452608c6d856..fd4f3e58c6f6 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -27,6 +27,8 @@ linkmodes_get_policy[ETHTOOL_A_LINKMODES_MAX + 1] = { [ETHTOOL_A_LINKMODES_PEER] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKMODES_SPEED] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKMODES_DUPLEX] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE] = { .type = NLA_REJECT }, }; static int linkmodes_prepare_data(const struct ethnl_req_info *req_base, @@ -63,6 +65,7 @@ static int linkmodes_reply_size(const struct ethnl_req_info *req_base, { const struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base); const struct ethtool_link_ksettings *ksettings = &data->ksettings; 
+ const struct ethtool_link_settings *lsettings = &ksettings->base; bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; int len, ret; @@ -86,6 +89,12 @@ static int linkmodes_reply_size(const struct ethnl_req_info *req_base, len += ret; } + if (lsettings->master_slave_cfg != MASTER_SLAVE_CFG_UNSUPPORTED) + len += nla_total_size(sizeof(u8)); + + if (lsettings->master_slave_state != MASTER_SLAVE_STATE_UNSUPPORTED) + len += nla_total_size(sizeof(u8)); + return len; } @@ -122,6 +131,16 @@ static int linkmodes_fill_reply(struct sk_buff *skb, nla_put_u8(skb, ETHTOOL_A_LINKMODES_DUPLEX, lsettings->duplex)) return -EMSGSIZE; + if (lsettings->master_slave_cfg != MASTER_SLAVE_CFG_UNSUPPORTED && + nla_put_u8(skb, ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG, + lsettings->master_slave_cfg)) + return -EMSGSIZE; + + if (lsettings->master_slave_state != MASTER_SLAVE_STATE_UNSUPPORTED && + nla_put_u8(skb, ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE, + lsettings->master_slave_state)) + return -EMSGSIZE; + return 0; } @@ -249,6 +268,8 @@ linkmodes_set_policy[ETHTOOL_A_LINKMODES_MAX + 1] = { [ETHTOOL_A_LINKMODES_PEER] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKMODES_SPEED] = { .type = NLA_U32 }, [ETHTOOL_A_LINKMODES_DUPLEX] = { .type = NLA_U8 }, + [ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG] = { .type = NLA_U8 }, + [ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE] = { .type = NLA_REJECT }, }; /* Set advertised link modes to all supported modes matching requested speed @@ -287,14 +308,45 @@ static bool ethnl_auto_linkmodes(struct ethtool_link_ksettings *ksettings, __ETHTOOL_LINK_MODE_MASK_NBITS); } +static bool ethnl_validate_master_slave_cfg(u8 cfg) +{ + switch (cfg) { + case MASTER_SLAVE_CFG_MASTER_PREFERRED: + case MASTER_SLAVE_CFG_SLAVE_PREFERRED: + case MASTER_SLAVE_CFG_MASTER_FORCE: + case MASTER_SLAVE_CFG_SLAVE_FORCE: + return true; + } + + return false; +} + static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb, struct ethtool_link_ksettings *ksettings, bool *mod) { struct ethtool_link_settings *lsettings = &ksettings->base; bool req_speed, req_duplex; + const struct nlattr *master_slave_cfg; int ret; + master_slave_cfg = tb[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG]; + if (master_slave_cfg) { + u8 cfg = nla_get_u8(master_slave_cfg); + + if (lsettings->master_slave_cfg == MASTER_SLAVE_CFG_UNSUPPORTED) { + NL_SET_ERR_MSG_ATTR(info->extack, master_slave_cfg, + "master/slave configuration not supported by device"); + return -EOPNOTSUPP; + } + + if (!ethnl_validate_master_slave_cfg(cfg)) { + NL_SET_ERR_MSG_ATTR(info->extack, master_slave_cfg, + "master/slave value is invalid"); + return -EOPNOTSUPP; + } + } + *mod = false; req_speed = tb[ETHTOOL_A_LINKMODES_SPEED]; req_duplex = tb[ETHTOOL_A_LINKMODES_DUPLEX]; @@ -311,6 +363,7 @@ static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb, mod); ethnl_update_u8(&lsettings->duplex, tb[ETHTOOL_A_LINKMODES_DUPLEX], mod); + ethnl_update_u8(&lsettings->master_slave_cfg, master_slave_cfg, mod); if (!tb[ETHTOOL_A_LINKMODES_OURS] && lsettings->autoneg && (req_speed || req_duplex) && -- cgit v1.2.3 From 6e728f321393b1fce9e1c2c3e55f9f7c15991321 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 7 May 2020 18:23:05 +0530 Subject: net: qrtr: Add MHI transport layer MHI is the transport layer used for communicating with external modems. Hence, this commit adds MHI transport layer support to QRTR for transferring QMI messages over IPC Router. Reviewed-by: Bjorn Andersson Signed-off-by: Manivannan Sadhasivam Signed-off-by: David S.
Miller --- net/qrtr/Kconfig | 7 +++ net/qrtr/Makefile | 2 + net/qrtr/mhi.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 136 insertions(+) create mode 100644 net/qrtr/mhi.c (limited to 'net') diff --git a/net/qrtr/Kconfig b/net/qrtr/Kconfig index 63f89cc6e82c..8eb876471564 100644 --- a/net/qrtr/Kconfig +++ b/net/qrtr/Kconfig @@ -29,4 +29,11 @@ config QRTR_TUN implement endpoints of QRTR, for purpose of tunneling data to other hosts or testing purposes. +config QRTR_MHI + tristate "MHI IPC Router channels" + depends on MHI_BUS + help + Say Y here to support MHI based ipcrouter channels. MHI is the + transport used for communicating to external modems. + endif # QRTR diff --git a/net/qrtr/Makefile b/net/qrtr/Makefile index 32d4e923925d..1b1411d158a7 100644 --- a/net/qrtr/Makefile +++ b/net/qrtr/Makefile @@ -5,3 +5,5 @@ obj-$(CONFIG_QRTR_SMD) += qrtr-smd.o qrtr-smd-y := smd.o obj-$(CONFIG_QRTR_TUN) += qrtr-tun.o qrtr-tun-y := tun.o +obj-$(CONFIG_QRTR_MHI) += qrtr-mhi.o +qrtr-mhi-y := mhi.o diff --git a/net/qrtr/mhi.c b/net/qrtr/mhi.c new file mode 100644 index 000000000000..ff0c41467fc1 --- /dev/null +++ b/net/qrtr/mhi.c @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018-2020, The Linux Foundation. All rights reserved. + */ + +#include +#include +#include +#include +#include + +#include "qrtr.h" + +struct qrtr_mhi_dev { + struct qrtr_endpoint ep; + struct mhi_device *mhi_dev; + struct device *dev; +}; + +/* From MHI to QRTR */ +static void qcom_mhi_qrtr_dl_callback(struct mhi_device *mhi_dev, + struct mhi_result *mhi_res) +{ + struct qrtr_mhi_dev *qdev = dev_get_drvdata(&mhi_dev->dev); + int rc; + + if (!qdev || mhi_res->transaction_status) + return; + + rc = qrtr_endpoint_post(&qdev->ep, mhi_res->buf_addr, + mhi_res->bytes_xferd); + if (rc == -EINVAL) + dev_err(qdev->dev, "invalid ipcrouter packet\n"); +} + +/* From QRTR to MHI */ +static void qcom_mhi_qrtr_ul_callback(struct mhi_device *mhi_dev, + struct mhi_result *mhi_res) +{ + struct sk_buff *skb = mhi_res->buf_addr; + + if (skb->sk) + sock_put(skb->sk); + consume_skb(skb); +} + +/* Send data over MHI */ +static int qcom_mhi_qrtr_send(struct qrtr_endpoint *ep, struct sk_buff *skb) +{ + struct qrtr_mhi_dev *qdev = container_of(ep, struct qrtr_mhi_dev, ep); + int rc; + + rc = skb_linearize(skb); + if (rc) + goto free_skb; + + rc = mhi_queue_skb(qdev->mhi_dev, DMA_TO_DEVICE, skb, skb->len, + MHI_EOT); + if (rc) + goto free_skb; + + if (skb->sk) + sock_hold(skb->sk); + + return rc; + +free_skb: + kfree_skb(skb); + + return rc; +} + +static int qcom_mhi_qrtr_probe(struct mhi_device *mhi_dev, + const struct mhi_device_id *id) +{ + struct qrtr_mhi_dev *qdev; + int rc; + + qdev = devm_kzalloc(&mhi_dev->dev, sizeof(*qdev), GFP_KERNEL); + if (!qdev) + return -ENOMEM; + + qdev->mhi_dev = mhi_dev; + qdev->dev = &mhi_dev->dev; + qdev->ep.xmit = qcom_mhi_qrtr_send; + + dev_set_drvdata(&mhi_dev->dev, qdev); + rc = qrtr_endpoint_register(&qdev->ep, QRTR_EP_NID_AUTO); + if (rc) + return rc; + + dev_dbg(qdev->dev, "Qualcomm MHI QRTR driver probed\n"); + + return 0; +} + +static void qcom_mhi_qrtr_remove(struct mhi_device *mhi_dev) +{ + struct qrtr_mhi_dev *qdev = dev_get_drvdata(&mhi_dev->dev); + + qrtr_endpoint_unregister(&qdev->ep); + dev_set_drvdata(&mhi_dev->dev, NULL); +} + +static const struct mhi_device_id qcom_mhi_qrtr_id_table[] = { + { .chan = "IPCR" }, + {} +}; +MODULE_DEVICE_TABLE(mhi, qcom_mhi_qrtr_id_table); + +static struct mhi_driver qcom_mhi_qrtr_driver = { + .probe = 
qcom_mhi_qrtr_probe, + .remove = qcom_mhi_qrtr_remove, + .dl_xfer_cb = qcom_mhi_qrtr_dl_callback, + .ul_xfer_cb = qcom_mhi_qrtr_ul_callback, + .id_table = qcom_mhi_qrtr_id_table, + .driver = { + .name = "qcom_mhi_qrtr", + }, +}; + +module_mhi_driver(qcom_mhi_qrtr_driver); + +MODULE_AUTHOR("Chris Lew "); +MODULE_AUTHOR("Manivannan Sadhasivam "); +MODULE_DESCRIPTION("Qualcomm IPC-Router MHI interface driver"); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From e42671084361302141a09284fde9bbc14fdd16bf Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 7 May 2020 18:23:06 +0530 Subject: net: qrtr: Do not depend on ARCH_QCOM IPC Router protocol is also used by external modems for exchanging QMI messages. Hence, it doesn't always depend on Qualcomm platforms. One such instance is the QCA6390 WLAN device connected to an x86 machine. Reviewed-by: Bjorn Andersson Signed-off-by: Manivannan Sadhasivam Signed-off-by: David S. Miller --- net/qrtr/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/qrtr/Kconfig b/net/qrtr/Kconfig index 8eb876471564..f362ca316015 100644 --- a/net/qrtr/Kconfig +++ b/net/qrtr/Kconfig @@ -4,7 +4,6 @@ config QRTR tristate "Qualcomm IPC Router support" - depends on ARCH_QCOM || COMPILE_TEST ---help--- Say Y if you intend to use Qualcomm IPC router protocol. The protocol is used to communicate with services provided by other -- cgit v1.2.3 From e1eea8112017cbdc596d90caf6ede191502a9691 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 5 May 2020 22:20:52 +0300 Subject: net: dsa: introduce a dsa_port_from_netdev public helper As its implementation shows, this is synonymous with calling dsa_slave_dev_check followed by dsa_slave_to_port, so it is quite simple already and provides functionality which is already there. However there is now a need for these functions outside dsa_priv.h, for example in drivers that perform mirroring and redirection through tc-flower offloads (they are given raw access to the flow_cls_offload structure), where they need to call this function on act->dev. But simply exporting dsa_slave_to_port would make it non-inline and would result in an extra function call in the hotpath, as can be seen for example in sja1105: Before: 000006dc : { 6dc: e92d4ff0 push {r4, r5, r6, r7, r8, r9, sl, fp, lr} 6e0: e1a04000 mov r4, r0 6e4: e591958c ldr r9, [r1, #1420] ; 0x58c <- Inline dsa_slave_to_port 6e8: e1a05001 mov r5, r1 6ec: e24dd004 sub sp, sp, #4 u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); 6f0: e1c901d8 ldrd r0, [r9, #24] 6f4: ebfffffe bl 0 6f4: R_ARM_CALL dsa_8021q_tx_vid u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); 6f8: e1d416b0 ldrh r1, [r4, #96] ; 0x60 u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); 6fc: e1a08000 mov r8, r0 After: 000006e4 : { 6e4: e92d4ff0 push {r4, r5, r6, r7, r8, r9, sl, fp, lr} 6e8: e1a04000 mov r4, r0 6ec: e24dd004 sub sp, sp, #4 struct dsa_port *dp = dsa_slave_to_port(netdev); 6f0: e1a00001 mov r0, r1 { 6f4: e1a05001 mov r5, r1 struct dsa_port *dp = dsa_slave_to_port(netdev); 6f8: ebfffffe bl 0 6f8: R_ARM_CALL dsa_slave_to_port 6fc: e1a09000 mov r9, r0 u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); 700: e1c001d8 ldrd r0, [r0, #24] 704: ebfffffe bl 0 704: R_ARM_CALL dsa_8021q_tx_vid Because we want to avoid possible performance regressions, introduce this new function which is designed to be public. Suggested-by: Vivien Didelot Signed-off-by: Vladimir Oltean Reviewed-by: Vivien Didelot Signed-off-by: David S.
Miller --- include/net/dsa.h | 1 + net/dsa/dsa.c | 9 +++++++++ 2 files changed, 10 insertions(+) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index fb3f9222f2a1..6dfc8c2f68b8 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -637,6 +637,7 @@ void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds, void *occ_get_priv); void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds, u64 resource_id); +struct dsa_port *dsa_port_from_netdev(struct net_device *netdev); struct dsa_devlink_priv { struct dsa_switch *ds; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 0384a911779e..1ce9ba8cf545 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -412,6 +412,15 @@ void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds, } EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_unregister); +struct dsa_port *dsa_port_from_netdev(struct net_device *netdev) +{ + if (!netdev || !dsa_slave_dev_check(netdev)) + return ERR_PTR(-ENODEV); + + return dsa_slave_to_port(netdev); +} +EXPORT_SYMBOL_GPL(dsa_port_from_netdev); + static int __init dsa_init_module(void) { int rc; -- cgit v1.2.3 From f96e87178bb819a2ee25623a6935fa850c2defdd Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 6 May 2020 15:47:45 +0000 Subject: hsr: remove WARN_ONCE() in hsr_fill_frame_info() When a VLAN frame is being sent, hsr calls WARN_ONCE() because hsr doesn't support VLAN. But using WARN_ONCE() is overkill. Using netdev_warn_once() is enough. Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_forward.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index ddd9605bad04..ed13760463de 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -321,7 +321,7 @@ static int hsr_fill_frame_info(struct hsr_frame_info *frame, if (ethhdr->h_proto == htons(ETH_P_8021Q)) { frame->is_vlan = true; /* FIXME: */ - WARN_ONCE(1, "HSR: VLAN not yet supported"); + netdev_warn_once(skb->dev, "VLAN not yet supported"); } if (ethhdr->h_proto == htons(ETH_P_PRP) || ethhdr->h_proto == htons(ETH_P_HSR)) { -- cgit v1.2.3 From c75a33c84b83ffbb8b8b58a6bf4dea69dba21326 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 6 May 2020 17:58:27 -0700 Subject: net: remove newlines in NL_SET_ERR_MSG_MOD The NL_SET_ERR_MSG_MOD macro is used to report a string describing an error message to userspace via the netlink extended ACK structure. It should not have a trailing newline. Add a cocci script which catches cases where the newline marker is present. Using this script, fix the handful of cases which accidentally included a trailing newline. I couldn't figure out a way to get a patch mode working, so this script only implements context, report, and org. Signed-off-by: Jacob Keller Cc: Jakub Kicinski Cc: Andy Whitcroft Cc: Joe Perches Signed-off-by: David S.
Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 +- drivers/net/ethernet/mscc/ocelot_tc.c | 6 +- net/bridge/br_mrp_netlink.c | 2 +- net/bridge/br_stp_if.c | 2 +- net/dsa/slave.c | 6 +- scripts/coccinelle/misc/newline_in_nl_msg.cocci | 75 +++++++++++++++++++++++++ 6 files changed, 84 insertions(+), 9 deletions(-) create mode 100644 scripts/coccinelle/misc/newline_in_nl_msg.cocci (limited to 'net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 77397aa66810..a050808f2128 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1097,7 +1097,7 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv, if (IS_ERR(priv->fs.tc.t)) { mutex_unlock(&priv->fs.tc.t_lock); NL_SET_ERR_MSG_MOD(extack, - "Failed to create tc offload table\n"); + "Failed to create tc offload table"); netdev_err(priv->netdev, "Failed to create tc offload table\n"); return PTR_ERR(priv->fs.tc.t); diff --git a/drivers/net/ethernet/mscc/ocelot_tc.c b/drivers/net/ethernet/mscc/ocelot_tc.c index d326e231f0ad..b7baf7624e18 100644 --- a/drivers/net/ethernet/mscc/ocelot_tc.c +++ b/drivers/net/ethernet/mscc/ocelot_tc.c @@ -48,7 +48,7 @@ static int ocelot_setup_tc_cls_matchall(struct ocelot_port_private *priv, if (priv->tc.police_id && priv->tc.police_id != f->cookie) { NL_SET_ERR_MSG_MOD(extack, - "Only one policer per port is supported\n"); + "Only one policer per port is supported"); return -EEXIST; } @@ -59,7 +59,7 @@ static int ocelot_setup_tc_cls_matchall(struct ocelot_port_private *priv, err = ocelot_port_policer_add(ocelot, port, &pol); if (err) { - NL_SET_ERR_MSG_MOD(extack, "Could not add policer\n"); + NL_SET_ERR_MSG_MOD(extack, "Could not add policer"); return err; } @@ -73,7 +73,7 @@ static int ocelot_setup_tc_cls_matchall(struct ocelot_port_private *priv, err = ocelot_port_policer_del(ocelot, port); if (err) { NL_SET_ERR_MSG_MOD(extack, - "Could not delete policer\n"); + "Could not delete policer"); return err; } priv->tc.police_id = 0; diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c index 503896638be0..397e7f710772 100644 --- a/net/bridge/br_mrp_netlink.c +++ b/net/bridge/br_mrp_netlink.c @@ -28,7 +28,7 @@ int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, int err; if (br->stp_enabled != BR_NO_STP) { - NL_SET_ERR_MSG_MOD(extack, "MRP can't be enabled if STP is already enabled\n"); + NL_SET_ERR_MSG_MOD(extack, "MRP can't be enabled if STP is already enabled"); return -EINVAL; } diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index a42850b7eb9a..ba55851fe132 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -203,7 +203,7 @@ int br_stp_set_enabled(struct net_bridge *br, unsigned long val, if (br_mrp_enabled(br)) { NL_SET_ERR_MSG_MOD(extack, - "STP can't be enabled if MRP is already enabled\n"); + "STP can't be enabled if MRP is already enabled"); return -EINVAL; } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index ea0fcf7bf786..dfb4282fc339 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -911,13 +911,13 @@ dsa_slave_add_cls_matchall_police(struct net_device *dev, if (!ds->ops->port_policer_add) { NL_SET_ERR_MSG_MOD(extack, - "Policing offload not implemented\n"); + "Policing offload not implemented"); return -EOPNOTSUPP; } if (!ingress) { NL_SET_ERR_MSG_MOD(extack, - "Only supported on ingress qdisc\n"); + "Only supported on ingress qdisc"); return -EOPNOTSUPP; } @@ -928,7 +928,7 @@ 
dsa_slave_add_cls_matchall_police(struct net_device *dev, list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) { if (mall_tc_entry->type == DSA_PORT_MALL_POLICER) { NL_SET_ERR_MSG_MOD(extack, - "Only one port policer allowed\n"); + "Only one port policer allowed"); return -EEXIST; } } diff --git a/scripts/coccinelle/misc/newline_in_nl_msg.cocci b/scripts/coccinelle/misc/newline_in_nl_msg.cocci new file mode 100644 index 000000000000..c175886e4015 --- /dev/null +++ b/scripts/coccinelle/misc/newline_in_nl_msg.cocci @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0-only +/// +/// Catch strings ending in newline with GENL_SET_ERR_MSG, NL_SET_ERR_MSG, +/// NL_SET_ERR_MSG_MOD. +/// +// Confidence: Very High +// Copyright: (C) 2020 Intel Corporation +// URL: http://coccinelle.lip6.fr/ +// Options: --no-includes --include-headers + +virtual context +virtual org +virtual report + +@r depends on context || org || report@ +expression e; +constant m; +position p; +@@ + \(GENL_SET_ERR_MSG\|NL_SET_ERR_MSG\|NL_SET_ERR_MSG_MOD\)(e,m@p) + +@script:python@ +m << r.m; +@@ + +if not m.endswith("\\n\""): + cocci.include_match(False) + +@r1 depends on r@ +identifier fname; +expression r.e; +constant r.m; +position r.p; +@@ + fname(e,m@p) + +//---------------------------------------------------------- +// For context mode +//---------------------------------------------------------- + +@depends on context && r@ +identifier r1.fname; +expression r.e; +constant r.m; +@@ +* fname(e,m) + +//---------------------------------------------------------- +// For org mode +//---------------------------------------------------------- + +@script:python depends on org@ +fname << r1.fname; +m << r.m; +p << r.p; +@@ + +if m.endswith("\\n\""): + msg="WARNING avoid newline at end of message in %s" % (fname) + msg_safe=msg.replace("[","@(").replace("]",")") + coccilib.org.print_todo(p[0], msg_safe) + +//---------------------------------------------------------- +// For report mode +//---------------------------------------------------------- + +@script:python depends on report@ +fname << r1.fname; +m << r.m; +p << r.p; +@@ + +if m.endswith("\\n\""): + msg="WARNING avoid newline at end of message in %s" % (fname) + coccilib.report.print_report(p[0], msg) -- cgit v1.2.3 From 636ef28d6e4d174e424102466caf572b0406fb0e Mon Sep 17 00:00:00 2001 From: zhang kai Date: Thu, 7 May 2020 11:08:30 +0800 Subject: tcp: tcp_mark_head_lost is only valid for sack-tcp so tcp_is_sack/reno checks are removed from tcp_mark_head_lost. Signed-off-by: zhang kai Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 32 +++++++------------------------- 1 file changed, 7 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 66e55e51c550..7529c2816f2f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2183,8 +2183,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) } /* Detect loss in event "A" above by marking head of queue up as lost. - * For non-SACK(Reno) senders, the first "packets" number of segments - * are considered lost. For RFC3517 SACK, a segment is considered lost if it + * For RFC3517 SACK, a segment is considered lost if it * has at least tp->reordering SACKed seqments above it; "packets" refers to * the maximum SACKed segments to pass before reaching this limit. 
*/ @@ -2192,10 +2191,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int cnt, oldcnt, lost; - unsigned int mss; + int cnt; /* Use SACK to deduce losses of new sequences sent during recovery */ - const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq; + const u32 loss_high = tp->snd_nxt; WARN_ON(packets > tp->packets_out); skb = tp->lost_skb_hint; @@ -2218,26 +2216,11 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) if (after(TCP_SKB_CB(skb)->end_seq, loss_high)) break; - oldcnt = cnt; - if (tcp_is_reno(tp) || - (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) cnt += tcp_skb_pcount(skb); - if (cnt > packets) { - if (tcp_is_sack(tp) || - (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || - (oldcnt >= packets)) - break; - - mss = tcp_skb_mss(skb); - /* If needed, chop off the prefix to mark as lost. */ - lost = (packets - oldcnt) * mss; - if (lost < skb->len && - tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, - lost, mss, GFP_ATOMIC) < 0) - break; - cnt = packets; - } + if (cnt > packets) + break; tcp_skb_mark_lost(tp, skb); @@ -2849,8 +2832,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, if (tcp_try_undo_partial(sk, prior_snd_una)) return; /* Partial ACK arrived. Force fast retransmit. */ - do_lost = tcp_is_reno(tp) || - tcp_force_fast_retransmit(sk); + do_lost = tcp_force_fast_retransmit(sk); } if (tcp_try_undo_dsack(sk)) { tcp_try_keep_open(sk); -- cgit v1.2.3 From ca7e3edc221d5cf750ae04cac29cf9fe9db38e84 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 7 May 2020 16:24:06 +0200 Subject: net/smc: remove set but not used variables 'del_llc, del_llc_resp' Fixes gcc '-Wunused-but-set-variable' warning: net/smc/smc_llc.c: In function 'smc_llc_cli_conf_link': net/smc/smc_llc.c:753:31: warning: variable 'del_llc' set but not used [-Wunused-but-set-variable] struct smc_llc_msg_del_link *del_llc; ^ net/smc/smc_llc.c: In function 'smc_llc_process_srv_delete_link': net/smc/smc_llc.c:1311:33: warning: variable 'del_llc_resp' set but not used [-Wunused-but-set-variable] struct smc_llc_msg_del_link *del_llc_resp; ^ Signed-off-by: YueHaibing Signed-off-by: Karsten Graul Signed-off-by: David S. 
Miller --- net/smc/smc_llc.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 4cc583678ac7..391237b601fe 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -750,7 +750,6 @@ static int smc_llc_cli_conf_link(struct smc_link *link, enum smc_lgr_type lgr_new_t) { struct smc_link_group *lgr = link->lgr; - struct smc_llc_msg_del_link *del_llc; struct smc_llc_qentry *qentry = NULL; int rc = 0; @@ -764,7 +763,6 @@ static int smc_llc_cli_conf_link(struct smc_link *link, } if (qentry->msg.raw.hdr.common.type != SMC_LLC_CONFIRM_LINK) { /* received DELETE_LINK instead */ - del_llc = &qentry->msg.delete_link; qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP; smc_llc_send_message(link, &qentry->msg); smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); @@ -1308,16 +1306,12 @@ static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) * enqueued DELETE_LINK request (forward it) */ if (!smc_llc_send_message(lnk, &qentry->msg)) { - struct smc_llc_msg_del_link *del_llc_resp; struct smc_llc_qentry *qentry2; qentry2 = smc_llc_wait(lgr, lnk, SMC_LLC_WAIT_TIME, SMC_LLC_DELETE_LINK); - if (!qentry2) { - } else { - del_llc_resp = &qentry2->msg.delete_link; + if (qentry2) smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); - } } } smcr_link_clear(lnk_del, true); -- cgit v1.2.3 From 307f660d056b5eb8f5bb2328fac3915ab75b5007 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 May 2020 09:32:18 -0700 Subject: netpoll: remove dev argument from netpoll_send_skb_on_dev() netpoll_send_skb_on_dev() can get the device pointer directly from np->dev Rename it to __netpoll_send_skb() Following patch will move netpoll_send_skb() out-of-line. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netpoll.h | 5 ++--- net/core/netpoll.c | 10 ++++++---- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 676f1ff161a9..00e0bae3d402 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -63,13 +63,12 @@ int netpoll_setup(struct netpoll *np); void __netpoll_cleanup(struct netpoll *np); void __netpoll_free(struct netpoll *np); void netpoll_cleanup(struct netpoll *np); -void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, - struct net_device *dev); +void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb); static inline void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { unsigned long flags; local_irq_save(flags); - netpoll_send_skb_on_dev(np, skb, np->dev); + __netpoll_send_skb(np, skb); local_irq_restore(flags); } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 15b366a1a958..c5059b7ffc94 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -305,17 +305,19 @@ static int netpoll_owner_active(struct net_device *dev) } /* call with IRQ disabled */ -void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, - struct net_device *dev) +void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { netdev_tx_t status = NETDEV_TX_BUSY; + struct net_device *dev; unsigned long tries; /* It is up to the caller to keep npinfo alive. 
*/ struct netpoll_info *npinfo; lockdep_assert_irqs_disabled(); - npinfo = rcu_dereference_bh(np->dev->npinfo); + dev = np->dev; + npinfo = rcu_dereference_bh(dev->npinfo); + if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { dev_kfree_skb_irq(skb); return; @@ -358,7 +360,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, schedule_delayed_work(&npinfo->tx_work,0); } } -EXPORT_SYMBOL(netpoll_send_skb_on_dev); +EXPORT_SYMBOL(__netpoll_send_skb); void netpoll_send_udp(struct netpoll *np, const char *msg, int len) { -- cgit v1.2.3 From fb1eee476b0d3be3e58dac1a3a96f726c6278bed Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 May 2020 09:32:19 -0700 Subject: netpoll: move netpoll_send_skb() out of line There is no need to inline this helper, as we intend to add more code in this function. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netpoll.h | 9 +-------- net/core/netpoll.c | 13 +++++++++++-- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 00e0bae3d402..e466ddffef61 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -63,14 +63,7 @@ int netpoll_setup(struct netpoll *np); void __netpoll_cleanup(struct netpoll *np); void __netpoll_free(struct netpoll *np); void netpoll_cleanup(struct netpoll *np); -void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb); -static inline void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) -{ - unsigned long flags; - local_irq_save(flags); - __netpoll_send_skb(np, skb); - local_irq_restore(flags); -} +void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb); #ifdef CONFIG_NETPOLL static inline void *netpoll_poll_lock(struct napi_struct *napi) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index c5059b7ffc94..34cd34f24423 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -305,7 +305,7 @@ static int netpoll_owner_active(struct net_device *dev) } /* call with IRQ disabled */ -void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) +static void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { netdev_tx_t status = NETDEV_TX_BUSY; struct net_device *dev; @@ -360,7 +360,16 @@ void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) schedule_delayed_work(&npinfo->tx_work,0); } } -EXPORT_SYMBOL(__netpoll_send_skb); + +void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) +{ + unsigned long flags; + + local_irq_save(flags); + __netpoll_send_skb(np, skb); + local_irq_restore(flags); +} +EXPORT_SYMBOL(netpoll_send_skb); void netpoll_send_udp(struct netpoll *np, const char *msg, int len) { -- cgit v1.2.3 From 1ddabdfaf70c202b88925edd74c66f4707dbd92e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 May 2020 09:32:20 -0700 Subject: netpoll: netpoll_send_skb() returns transmit status Some callers want to know if the packet has been sent or dropped, to inform upper stacks. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/netpoll.h | 2 +- net/core/netpoll.c | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index e466ddffef61..f47af135bd56 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -63,7 +63,7 @@ int netpoll_setup(struct netpoll *np); void __netpoll_cleanup(struct netpoll *np); void __netpoll_free(struct netpoll *np); void netpoll_cleanup(struct netpoll *np); -void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb); +netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb); #ifdef CONFIG_NETPOLL static inline void *netpoll_poll_lock(struct napi_struct *napi) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 34cd34f24423..40d2753aa47d 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -305,7 +305,7 @@ static int netpoll_owner_active(struct net_device *dev) } /* call with IRQ disabled */ -static void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) +static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { netdev_tx_t status = NETDEV_TX_BUSY; struct net_device *dev; @@ -320,7 +320,7 @@ static void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { dev_kfree_skb_irq(skb); - return; + return NET_XMIT_DROP; } /* don't get messages out of order, and no recursion */ @@ -359,15 +359,18 @@ static void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) skb_queue_tail(&npinfo->txq, skb); schedule_delayed_work(&npinfo->tx_work,0); } + return NETDEV_TX_OK; } -void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) +netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { unsigned long flags; + netdev_tx_t ret; local_irq_save(flags); - __netpoll_send_skb(np, skb); + ret = __netpoll_send_skb(np, skb); local_irq_restore(flags); + return ret; } EXPORT_SYMBOL(netpoll_send_skb); -- cgit v1.2.3 From f78ed2204db9fc35b545d693865bddbe0149aa1f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 May 2020 09:32:21 -0700 Subject: netpoll: accept NULL np argument in netpoll_send_skb() netpoll_send_skb() callers seem to leak skb if the np pointer is NULL. While this should not happen, we can make the code more robust. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- drivers/net/macvlan.c | 5 ++--- include/linux/if_team.h | 5 +---- include/net/bonding.h | 5 +---- net/8021q/vlan_dev.c | 5 ++--- net/bridge/br_private.h | 5 +---- net/core/netpoll.c | 11 ++++++++--- net/dsa/slave.c | 5 ++--- 7 files changed, 17 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 34eb073cdd74..9a419d5102ce 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -542,12 +542,11 @@ xmit_world: static inline netdev_tx_t macvlan_netpoll_send_skb(struct macvlan_dev *vlan, struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER - if (vlan->netpoll) - netpoll_send_skb(vlan->netpoll, skb); + return netpoll_send_skb(vlan->netpoll, skb); #else BUG(); -#endif return NETDEV_TX_OK; +#endif } static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, diff --git a/include/linux/if_team.h b/include/linux/if_team.h index ec7e4bd07f82..537dc2b8c879 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -102,10 +102,7 @@ static inline bool team_port_dev_txable(const struct net_device *port_dev) static inline void team_netpoll_send_skb(struct team_port *port, struct sk_buff *skb) { - struct netpoll *np = port->np; - - if (np) - netpoll_send_skb(np, skb); + netpoll_send_skb(port->np, skb); } #else static inline void team_netpoll_send_skb(struct team_port *port, diff --git a/include/net/bonding.h b/include/net/bonding.h index 0b696da5c115..f211983cd52a 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -507,10 +507,7 @@ static inline unsigned long slave_last_rx(struct bonding *bond, static inline void bond_netpoll_send_skb(const struct slave *slave, struct sk_buff *skb) { - struct netpoll *np = slave->np; - - if (np) - netpoll_send_skb(np, skb); + netpoll_send_skb(slave->np, skb); } #else static inline void bond_netpoll_send_skb(const struct slave *slave, diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 319220b2341d..f00bb57f0f60 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -88,12 +88,11 @@ static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, static inline netdev_tx_t vlan_netpoll_send_skb(struct vlan_dev_priv *vlan, struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER - if (vlan->netpoll) - netpoll_send_skb(vlan->netpoll, skb); + return netpoll_send_skb(vlan->netpoll, skb); #else BUG(); -#endif return NETDEV_TX_OK; +#endif } static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb, diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 78d3a951180d..4dc21e8f7e33 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -598,10 +598,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev); static inline void br_netpoll_send_skb(const struct net_bridge_port *p, struct sk_buff *skb) { - struct netpoll *np = p->np; - - if (np) - netpoll_send_skb(np, skb); + netpoll_send_skb(p->np, skb); } int br_netpoll_enable(struct net_bridge_port *p); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 40d2753aa47d..093e90e52bc2 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -367,9 +367,14 @@ netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) unsigned long flags; netdev_tx_t ret; - local_irq_save(flags); - ret = __netpoll_send_skb(np, skb); - local_irq_restore(flags); + if (unlikely(!np)) { + dev_kfree_skb_irq(skb); + ret = NET_XMIT_DROP; + } else { + local_irq_save(flags); + ret = __netpoll_send_skb(np, skb); + local_irq_restore(flags); + } return 
ret; } EXPORT_SYMBOL(netpoll_send_skb); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index dfb4282fc339..61b0de52040a 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -445,12 +445,11 @@ static inline netdev_tx_t dsa_slave_netpoll_send_skb(struct net_device *dev, #ifdef CONFIG_NET_POLL_CONTROLLER struct dsa_slave_priv *p = netdev_priv(dev); - if (p->netpoll) - netpoll_send_skb(p->netpoll, skb); + return netpoll_send_skb(p->netpoll, skb); #else BUG(); -#endif return NETDEV_TX_OK; +#endif } static void dsa_skb_tx_timestamp(struct dsa_slave_priv *p, -- cgit v1.2.3 From 790709f249728640faa4eff38286a9feb34fed81 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 May 2020 10:05:39 -0700 Subject: net: relax SO_TXTIME CAP_NET_ADMIN check Now sch_fq has horizon feature, we want to allow QUIC/UDP applications to use EDT model so that pacing can be offloaded to the kernel (sch_fq) or the NIC. Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Acked-by: Willem de Bruijn Acked-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- net/core/sock.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index b714162213ae..fd85e651ce28 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1152,23 +1152,31 @@ set_rcvbuf: break; case SO_TXTIME: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { - ret = -EPERM; - } else if (optlen != sizeof(struct sock_txtime)) { + if (optlen != sizeof(struct sock_txtime)) { ret = -EINVAL; + break; } else if (copy_from_user(&sk_txtime, optval, sizeof(struct sock_txtime))) { ret = -EFAULT; + break; } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { ret = -EINVAL; - } else { - sock_valbool_flag(sk, SOCK_TXTIME, true); - sk->sk_clockid = sk_txtime.clockid; - sk->sk_txtime_deadline_mode = - !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); - sk->sk_txtime_report_errors = - !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); + break; + } + /* CLOCK_MONOTONIC is only used by sch_fq, and this packet + * scheduler has enough safe guards. + */ + if (sk_txtime.clockid != CLOCK_MONOTONIC && + !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + break; } + sock_valbool_flag(sk, SOCK_TXTIME, true); + sk->sk_clockid = sk_txtime.clockid; + sk->sk_txtime_deadline_mode = + !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); + sk->sk_txtime_report_errors = + !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); break; case SO_BINDTOIFINDEX: -- cgit v1.2.3 From cb0721c7e200750907bb8ef59b12646a5cb2dadf Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 8 May 2020 10:46:10 -0700 Subject: net: Refactor arguments of inet{,6}_bind The intent is to add an additional bind parameter in the next commit. Instead of adding another argument, let's convert all existing flag arguments into an extendable bit field. No functional changes. 
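
[Illustration, not part of the patch: a small self-contained sketch of the pattern this refactor adopts. The flag names mirror the patch; the function body and main() are hypothetical. A single extendable flags word replaces a growing list of bool parameters, so new behaviors can be added without touching every caller.]

#include <stdint.h>
#include <stdio.h>

/* Flag names mirror the patch; the function itself is illustrative. */
#define BIND_FORCE_ADDRESS_NO_PORT	(1u << 0)
#define BIND_WITH_LOCK			(1u << 1)

static int example_bind(uint32_t flags)
{
	if (flags & BIND_WITH_LOCK)
		printf("lock_sock()\n");
	if (!(flags & BIND_FORCE_ADDRESS_NO_PORT))
		printf("allocate local port\n");
	if (flags & BIND_WITH_LOCK)
		printf("release_sock()\n");
	return 0;
}

int main(void)
{
	/* Old API: example_bind(false, true). New API composes flags: */
	return example_bind(BIND_WITH_LOCK);
}
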
Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Andrey Ignatov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200508174611.228805-4-sdf@google.com --- include/net/inet_common.h | 6 +++++- include/net/ipv6_stubs.h | 2 +- net/core/filter.c | 6 ++++-- net/ipv4/af_inet.c | 10 +++++----- net/ipv6/af_inet6.c | 10 +++++----- 5 files changed, 20 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/inet_common.h b/include/net/inet_common.h index ae2ba897675c..c38f4f7d660a 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -35,8 +35,12 @@ int inet_shutdown(struct socket *sock, int how); int inet_listen(struct socket *sock, int backlog); void inet_sock_destruct(struct sock *sk); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +/* Don't allocate port at this moment, defer to connect. */ +#define BIND_FORCE_ADDRESS_NO_PORT (1 << 0) +/* Grab and release socket lock. */ +#define BIND_WITH_LOCK (1 << 1) int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock); + u32 flags); int inet_getname(struct socket *sock, struct sockaddr *uaddr, int peer); int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index a5f7c12c326a..6e622dd3122e 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -63,7 +63,7 @@ extern const struct ipv6_stub *ipv6_stub __read_mostly; /* A stub used by bpf helpers. Similarly ugly as ipv6_stub */ struct ipv6_bpf_stub { int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock); + u32 flags); struct sock *(*udp6_lib_lookup)(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, diff --git a/net/core/filter.c b/net/core/filter.c index dfaf5df13722..fa9ddab5dd1f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4538,7 +4538,8 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, return err; if (((struct sockaddr_in *)addr)->sin_port != htons(0)) return err; - return __inet_bind(sk, addr, addr_len, true, false); + return __inet_bind(sk, addr, addr_len, + BIND_FORCE_ADDRESS_NO_PORT); #if IS_ENABLED(CONFIG_IPV6) } else if (addr->sa_family == AF_INET6) { if (addr_len < SIN6_LEN_RFC2133) @@ -4548,7 +4549,8 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, /* ipv6_bpf_stub cannot be NULL, since it's called from * bpf_cgroup_inet6_connect hook and ipv6 is already loaded */ - return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, true, false); + return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, + BIND_FORCE_ADDRESS_NO_PORT); #endif /* CONFIG_IPV6 */ } #endif /* CONFIG_INET */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6177c4ba0037..68e74b1b0f26 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -450,12 +450,12 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (err) return err; - return __inet_bind(sk, uaddr, addr_len, false, true); + return __inet_bind(sk, uaddr, addr_len, BIND_WITH_LOCK); } EXPORT_SYMBOL(inet_bind); int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock) + u32 flags) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; struct inet_sock *inet = inet_sk(sk); @@ -506,7 +506,7 @@ int __inet_bind(struct 
sock *sk, struct sockaddr *uaddr, int addr_len, * would be illegal to use them (multicast/broadcast) in * which case the sending device address is used. */ - if (with_lock) + if (flags & BIND_WITH_LOCK) lock_sock(sk); /* Check these errors (active socket, double bind). */ @@ -520,7 +520,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, /* Make sure we are allowed to bind here. */ if (snum || !(inet->bind_address_no_port || - force_bind_address_no_port)) { + (flags & BIND_FORCE_ADDRESS_NO_PORT))) { if (sk->sk_prot->get_port(sk, snum)) { inet->inet_saddr = inet->inet_rcv_saddr = 0; err = -EADDRINUSE; @@ -543,7 +543,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, sk_dst_reset(sk); err = 0; out_release_sock: - if (with_lock) + if (flags & BIND_WITH_LOCK) release_sock(sk); out: return err; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 345baa0a754f..552c2592b81c 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -273,7 +273,7 @@ out_rcu_unlock: } static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock) + u32 flags) { struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr; struct inet_sock *inet = inet_sk(sk); @@ -297,7 +297,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; - if (with_lock) + if (flags & BIND_WITH_LOCK) lock_sock(sk); /* Check these errors (active socket, double bind). */ @@ -400,7 +400,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, /* Make sure we are allowed to bind here. */ if (snum || !(inet->bind_address_no_port || - force_bind_address_no_port)) { + (flags & BIND_FORCE_ADDRESS_NO_PORT))) { if (sk->sk_prot->get_port(sk, snum)) { sk->sk_ipv6only = saved_ipv6only; inet_reset_saddr(sk); @@ -423,7 +423,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, inet->inet_dport = 0; inet->inet_daddr = 0; out: - if (with_lock) + if (flags & BIND_WITH_LOCK) release_sock(sk); return err; out_unlock: @@ -451,7 +451,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (err) return err; - return __inet6_bind(sk, uaddr, addr_len, false, true); + return __inet6_bind(sk, uaddr, addr_len, BIND_WITH_LOCK); } EXPORT_SYMBOL(inet6_bind); -- cgit v1.2.3 From 8086fbaf49345f988deec539ec8e182b02914401 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 8 May 2020 10:46:11 -0700 Subject: bpf: Allow any port in bpf_bind helper We want to have a tighter control on what ports we bind to in the BPF_CGROUP_INET{4,6}_CONNECT hooks even if it means connect() becomes slightly more expensive. The expensive part comes from the fact that we now need to call inet_csk_get_port() that verifies that the port is not used and allocates an entry in the hash table for it. Since we can't rely on "snum || !bind_address_no_port" to prevent us from calling POST_BIND hook anymore, let's add another bind flag to indicate that the call site is BPF program. 
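
[Illustration, not part of the patch: a hedged userspace sanity check of the behavior; the selftests added below exercise this more thoroughly. Assuming the connect4 program from this patch is attached to the caller's cgroup, the connected socket's source port should come back as the one bpf_bind() forced. Error handling is trimmed and the destination address is hypothetical.]

#include <arpa/inet.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in dst = { .sin_family = AF_INET, .sin_port = htons(80) };
	struct sockaddr_in src;
	socklen_t len = sizeof(src);
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr);
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) == 0 &&
	    getsockname(fd, (struct sockaddr *)&src, &len) == 0)
		printf("source port: %u\n", ntohs(src.sin_port)); /* expect 22222 */
	close(fd);
	return 0;
}
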
v5: * fix wrong AF_INET (should be AF_INET6) in the bpf program for v6 v3: * More bpf_bind documentation refinements (Martin KaFai Lau) * Add UDP tests as well (Martin KaFai Lau) * Don't start the thread, just do socket+bind+listen (Martin KaFai Lau) v2: * Update documentation (Andrey Ignatov) * Pass BIND_FORCE_ADDRESS_NO_PORT conditionally (Andrey Ignatov) Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Andrey Ignatov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200508174611.228805-5-sdf@google.com --- include/net/inet_common.h | 2 + include/uapi/linux/bpf.h | 9 +- net/core/filter.c | 18 ++-- net/ipv4/af_inet.c | 10 +- net/ipv6/af_inet6.c | 12 ++- tools/include/uapi/linux/bpf.h | 9 +- .../selftests/bpf/prog_tests/connect_force_port.c | 115 +++++++++++++++++++++ .../selftests/bpf/progs/connect_force_port4.c | 28 +++++ .../selftests/bpf/progs/connect_force_port6.c | 28 +++++ 9 files changed, 203 insertions(+), 28 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/connect_force_port.c create mode 100644 tools/testing/selftests/bpf/progs/connect_force_port4.c create mode 100644 tools/testing/selftests/bpf/progs/connect_force_port6.c (limited to 'net') diff --git a/include/net/inet_common.h b/include/net/inet_common.h index c38f4f7d660a..cb2818862919 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -39,6 +39,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); #define BIND_FORCE_ADDRESS_NO_PORT (1 << 0) /* Grab and release socket lock. */ #define BIND_WITH_LOCK (1 << 1) +/* Called from BPF program. */ +#define BIND_FROM_BPF (1 << 2) int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, u32 flags); int inet_getname(struct socket *sock, struct sockaddr *uaddr, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b3643e27e264..6e5e7caa3739 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1994,10 +1994,11 @@ union bpf_attr { * * This helper works for IPv4 and IPv6, TCP and UDP sockets. The * domain (*addr*\ **->sa_family**) must be **AF_INET** (or - * **AF_INET6**). Looking for a free port to bind to can be - * expensive, therefore binding to port is not permitted by the - * helper: *addr*\ **->sin_port** (or **sin6_port**, respectively) - * must be set to zero. + * **AF_INET6**). It's advised to pass zero port (**sin_port** + * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like + * behavior and lets the kernel efficiently pick up an unused + * port as long as 4-tuple is unique. Passing non-zero port might + * lead to degraded performance. * Return * 0 on success, or a negative error in case of failure. * diff --git a/net/core/filter.c b/net/core/filter.c index fa9ddab5dd1f..da0634979f53 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4525,32 +4525,28 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, { #ifdef CONFIG_INET struct sock *sk = ctx->sk; + u32 flags = BIND_FROM_BPF; int err; - /* Binding to port can be expensive so it's prohibited in the helper. - * Only binding to IP is supported. 
- */ err = -EINVAL; if (addr_len < offsetofend(struct sockaddr, sa_family)) return err; if (addr->sa_family == AF_INET) { if (addr_len < sizeof(struct sockaddr_in)) return err; - if (((struct sockaddr_in *)addr)->sin_port != htons(0)) - return err; - return __inet_bind(sk, addr, addr_len, - BIND_FORCE_ADDRESS_NO_PORT); + if (((struct sockaddr_in *)addr)->sin_port == htons(0)) + flags |= BIND_FORCE_ADDRESS_NO_PORT; + return __inet_bind(sk, addr, addr_len, flags); #if IS_ENABLED(CONFIG_IPV6) } else if (addr->sa_family == AF_INET6) { if (addr_len < SIN6_LEN_RFC2133) return err; - if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) - return err; + if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0)) + flags |= BIND_FORCE_ADDRESS_NO_PORT; /* ipv6_bpf_stub cannot be NULL, since it's called from * bpf_cgroup_inet6_connect hook and ipv6 is already loaded */ - return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, - BIND_FORCE_ADDRESS_NO_PORT); + return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags); #endif /* CONFIG_IPV6 */ } #endif /* CONFIG_INET */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 68e74b1b0f26..fcf0d12a407a 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -526,10 +526,12 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, err = -EADDRINUSE; goto out_release_sock; } - err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); - if (err) { - inet->inet_saddr = inet->inet_rcv_saddr = 0; - goto out_release_sock; + if (!(flags & BIND_FROM_BPF)) { + err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); + if (err) { + inet->inet_saddr = inet->inet_rcv_saddr = 0; + goto out_release_sock; + } } } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 552c2592b81c..771a462a8322 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -407,11 +407,13 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, err = -EADDRINUSE; goto out; } - err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk); - if (err) { - sk->sk_ipv6only = saved_ipv6only; - inet_reset_saddr(sk); - goto out; + if (!(flags & BIND_FROM_BPF)) { + err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk); + if (err) { + sk->sk_ipv6only = saved_ipv6only; + inet_reset_saddr(sk); + goto out; + } } } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b3643e27e264..6e5e7caa3739 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1994,10 +1994,11 @@ union bpf_attr { * * This helper works for IPv4 and IPv6, TCP and UDP sockets. The * domain (*addr*\ **->sa_family**) must be **AF_INET** (or - * **AF_INET6**). Looking for a free port to bind to can be - * expensive, therefore binding to port is not permitted by the - * helper: *addr*\ **->sin_port** (or **sin6_port**, respectively) - * must be set to zero. + * **AF_INET6**). It's advised to pass zero port (**sin_port** + * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like + * behavior and lets the kernel efficiently pick up an unused + * port as long as 4-tuple is unique. Passing non-zero port might + * lead to degraded performance. * Return * 0 on success, or a negative error in case of failure. 
* diff --git a/tools/testing/selftests/bpf/prog_tests/connect_force_port.c b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c new file mode 100644 index 000000000000..47fbb20cb6a6 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "cgroup_helpers.h" +#include "network_helpers.h" + +static int verify_port(int family, int fd, int expected) +{ + struct sockaddr_storage addr; + socklen_t len = sizeof(addr); + __u16 port; + + if (getsockname(fd, (struct sockaddr *)&addr, &len)) { + log_err("Failed to get server addr"); + return -1; + } + + if (family == AF_INET) + port = ((struct sockaddr_in *)&addr)->sin_port; + else + port = ((struct sockaddr_in6 *)&addr)->sin6_port; + + if (ntohs(port) != expected) { + log_err("Unexpected port %d, expected %d", ntohs(port), + expected); + return -1; + } + + return 0; +} + +static int run_test(int cgroup_fd, int server_fd, int family, int type) +{ + struct bpf_prog_load_attr attr = { + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + }; + struct bpf_object *obj; + int expected_port; + int prog_fd; + int err; + int fd; + + if (family == AF_INET) { + attr.file = "./connect_force_port4.o"; + attr.expected_attach_type = BPF_CGROUP_INET4_CONNECT; + expected_port = 22222; + } else { + attr.file = "./connect_force_port6.o"; + attr.expected_attach_type = BPF_CGROUP_INET6_CONNECT; + expected_port = 22223; + } + + err = bpf_prog_load_xattr(&attr, &obj, &prog_fd); + if (err) { + log_err("Failed to load BPF object"); + return -1; + } + + err = bpf_prog_attach(prog_fd, cgroup_fd, attr.expected_attach_type, + 0); + if (err) { + log_err("Failed to attach BPF program"); + goto close_bpf_object; + } + + fd = connect_to_fd(family, type, server_fd); + if (fd < 0) { + err = -1; + goto close_bpf_object; + } + + err = verify_port(family, fd, expected_port); + + close(fd); + +close_bpf_object: + bpf_object__close(obj); + return err; +} + +void test_connect_force_port(void) +{ + int server_fd, cgroup_fd; + + cgroup_fd = test__join_cgroup("/connect_force_port"); + if (CHECK_FAIL(cgroup_fd < 0)) + return; + + server_fd = start_server(AF_INET, SOCK_STREAM); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_STREAM)); + close(server_fd); + + server_fd = start_server(AF_INET6, SOCK_STREAM); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_STREAM)); + close(server_fd); + + server_fd = start_server(AF_INET, SOCK_DGRAM); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_DGRAM)); + close(server_fd); + + server_fd = start_server(AF_INET6, SOCK_DGRAM); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_DGRAM)); + close(server_fd); + +close_cgroup_fd: + close(cgroup_fd); +} diff --git a/tools/testing/selftests/bpf/progs/connect_force_port4.c b/tools/testing/selftests/bpf/progs/connect_force_port4.c new file mode 100644 index 000000000000..1b8eb34b2db0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/connect_force_port4.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include +#include +#include +#include + +#include +#include + +char _license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; + +SEC("cgroup/connect4") +int _connect4(struct bpf_sock_addr *ctx) +{ + struct sockaddr_in sa = {}; 
+ + sa.sin_family = AF_INET; + sa.sin_port = bpf_htons(22222); + sa.sin_addr.s_addr = bpf_htonl(0x7f000001); /* 127.0.0.1 */ + + if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) + return 0; + + return 1; +} diff --git a/tools/testing/selftests/bpf/progs/connect_force_port6.c b/tools/testing/selftests/bpf/progs/connect_force_port6.c new file mode 100644 index 000000000000..ae6f7d750b4c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/connect_force_port6.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include +#include +#include +#include + +#include +#include + +char _license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; + +SEC("cgroup/connect6") +int _connect6(struct bpf_sock_addr *ctx) +{ + struct sockaddr_in6 sa = {}; + + sa.sin6_family = AF_INET6; + sa.sin6_port = bpf_htons(22223); + sa.sin6_addr.s6_addr32[3] = bpf_htonl(1); /* ::1 */ + + if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) + return 0; + + return 1; +} -- cgit v1.2.3 From cf86a086a18095e33e0637cb78cda1fcf5280852 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 May 2020 18:58:10 -0700 Subject: net/dst: use a smaller percpu_counter batch for dst entries accounting percpu_counter_add() uses a default batch size which is quite big on platforms with 256 cpus. (2*256 -> 512) This means dst_entries_get_fast() can be off by +/- 2*(nr_cpus^2) (131072 on servers with 256 cpus) Reduce the batch size to something more reasonable, and add logic to ip6_dst_gc() to call dst_entries_get_slow() before calling the _very_ expensive fib6_run_gc() function. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/dst_ops.h | 4 +++- net/core/dst.c | 8 ++++---- net/ipv6/route.c | 3 +++ 3 files changed, 10 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h index 443863c7b8da..88ff7bb2bb9b 100644 --- a/include/net/dst_ops.h +++ b/include/net/dst_ops.h @@ -53,9 +53,11 @@ static inline int dst_entries_get_slow(struct dst_ops *dst) return percpu_counter_sum_positive(&dst->pcpuc_entries); } +#define DST_PERCPU_COUNTER_BATCH 32 static inline void dst_entries_add(struct dst_ops *dst, int val) { - percpu_counter_add(&dst->pcpuc_entries, val); + percpu_counter_add_batch(&dst->pcpuc_entries, val, + DST_PERCPU_COUNTER_BATCH); } static inline int dst_entries_init(struct dst_ops *dst) diff --git a/net/core/dst.c b/net/core/dst.c index 193af526e908..d6b6ced0d451 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -81,11 +81,11 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, { struct dst_entry *dst; - if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { + if (ops->gc && + !(flags & DST_NOCOUNT) && + dst_entries_get_fast(ops) > ops->gc_thresh) { if (ops->gc(ops)) { - printk_ratelimited(KERN_NOTICE "Route cache is full: " - "consider increasing sysctl " - "net.ipv[4|6].route.max_size.\n"); + pr_notice_ratelimited("Route cache is full: consider increasing sysctl net.ipv6.route.max_size.\n"); return NULL; } } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1ff142393c76..a9072dba00f4 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3195,6 +3195,9 @@ static int ip6_dst_gc(struct dst_ops *ops) int entries; entries = dst_entries_get_fast(ops); + if (entries > rt_max_size) + entries = dst_entries_get_slow(ops); + if (time_after(rt_last_gc + rt_min_interval, jiffies) && entries <= rt_max_size) goto out; -- cgit v1.2.3 From 3712c1c2ef2a0cbca633d150e568deb8f63a6f29 Mon Sep 17 00:00:00 2001 
From: Yang Yingliang Date: Fri, 8 May 2020 11:52:08 +0800 Subject: ieee802154: 6lowpan: remove unnecessary comparison The type of dispatch is u8 which is always '<=' 0xff, so the dispatch <= 0xff is always true, we can remove this comparison. Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Signed-off-by: Stefan Schmidt Signed-off-by: Jakub Kicinski --- net/ieee802154/6lowpan/rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ieee802154/6lowpan/rx.c b/net/ieee802154/6lowpan/rx.c index ee179380a766..b34d050c9687 100644 --- a/net/ieee802154/6lowpan/rx.c +++ b/net/ieee802154/6lowpan/rx.c @@ -240,7 +240,7 @@ static inline bool lowpan_is_reserved(u8 dispatch) return ((dispatch >= 0x44 && dispatch <= 0x4F) || (dispatch >= 0x51 && dispatch <= 0x5F) || (dispatch >= 0xc8 && dispatch <= 0xdf) || - (dispatch >= 0xe8 && dispatch <= 0xff)); + dispatch >= 0xe8); } /* lowpan_rx_h_check checks on generic 6LoWPAN requirements -- cgit v1.2.3 From d8882935fcae28bceb5f6f56f09cded8d36d85e6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 May 2020 07:34:14 -0700 Subject: ipv6: use DST_NOCOUNT in ip6_rt_pcpu_alloc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We currently have to adjust ipv6 route gc_thresh/max_size depending on number of cpus on a server, this makes very little sense. If the kernels sets /proc/sys/net/ipv6/route/gc_thresh to 1024 and /proc/sys/net/ipv6/route/max_size to 4096, then we better not track the percpu dst that our implementation uses. Only routes not added (directly or indirectly) by the admin should be tracked and limited. Signed-off-by: Eric Dumazet Cc: Martin KaFai Lau Cc: David Ahern Cc: Maciej Żenczykowski Acked-by: Wei Wang Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a9072dba00f4..4292653af533 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1377,7 +1377,7 @@ static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res) rcu_read_lock(); dev = ip6_rt_get_dev_rcu(res); - pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags); + pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags | DST_NOCOUNT); rcu_read_unlock(); if (!pcpu_rt) { fib6_info_release(f6i); -- cgit v1.2.3 From 138d0be35b141e09f6b267c6ae4094318d4e4491 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:10 -0700 Subject: net: bpf: Add netlink and ipv6_route bpf_iter targets This patch added netlink and ipv6_route targets, using the same seq_ops (except show() and minor changes for stop()) for /proc/net/{netlink,ipv6_route}. The net namespace for these targets are the current net namespace at file open stage, similar to /proc/net/{netlink,ipv6_route} reference counting the net namespace at seq_file open stage. Since module is not supported for now, ipv6_route is supported only if the IPV6 is built-in, i.e., not compiled as a module. The restriction can be lifted once module is properly supported for bpf_iter. 
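
[Illustration, not part of the patch: a minimal sketch of a program for the new ipv6_route target, in the style of the selftests that accompany this series. vmlinux.h (providing the kernel types and the bpf_iter__ipv6_route context) and the bpf_seq_printf() helper are assumed available; the field printed is an arbitrary choice.]

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"		/* assumed: kernel BTF types */
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

SEC("iter/ipv6_route")
int dump_ipv6_route(struct bpf_iter__ipv6_route *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	struct fib6_info *rt = ctx->rt;
	static const char fmt[] = "metric %u\n";
	unsigned long long metric;

	if (!rt)	/* stop() invokes the program with v == NULL */
		return 0;

	metric = rt->fib6_metric;
	bpf_seq_printf(seq, fmt, sizeof(fmt), &metric, sizeof(metric));
	return 0;
}
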
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175910.2476329-1-yhs@fb.com --- fs/proc/proc_net.c | 19 +++++++++++ include/linux/proc_fs.h | 3 ++ net/ipv6/ip6_fib.c | 65 ++++++++++++++++++++++++++++++++++-- net/ipv6/route.c | 37 ++++++++++++++++++++ net/netlink/af_netlink.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 207 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 4888c5224442..dba63b2429f0 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -98,6 +98,25 @@ static const struct proc_ops proc_net_seq_ops = { .proc_release = seq_release_net, }; +int bpf_iter_init_seq_net(void *priv_data) +{ +#ifdef CONFIG_NET_NS + struct seq_net_private *p = priv_data; + + p->net = get_net(current->nsproxy->net_ns); +#endif + return 0; +} + +void bpf_iter_fini_seq_net(void *priv_data) +{ +#ifdef CONFIG_NET_NS + struct seq_net_private *p = priv_data; + + put_net(p->net); +#endif +} + struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct seq_operations *ops, unsigned int state_size, void *data) diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 45c05fd9c99d..03953c59807d 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -105,6 +105,9 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo void *data); extern struct pid *tgid_pidfd_to_pid(const struct file *file); +extern int bpf_iter_init_seq_net(void *priv_data); +extern void bpf_iter_fini_seq_net(void *priv_data); + #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * The architecture which selects CONFIG_PROC_PID_ARCH_STATUS must diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 46ed56719476..a1fcc0ca21af 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2467,7 +2467,7 @@ void fib6_gc_cleanup(void) } #ifdef CONFIG_PROC_FS -static int ipv6_route_seq_show(struct seq_file *seq, void *v) +static int ipv6_route_native_seq_show(struct seq_file *seq, void *v) { struct fib6_info *rt = v; struct ipv6_route_iter *iter = seq->private; @@ -2625,7 +2625,7 @@ static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) return w->node && !(w->state == FWS_U && w->node == w->root); } -static void ipv6_route_seq_stop(struct seq_file *seq, void *v) +static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v) __releases(RCU_BH) { struct net *net = seq_file_net(seq); @@ -2637,6 +2637,67 @@ static void ipv6_route_seq_stop(struct seq_file *seq, void *v) rcu_read_unlock_bh(); } +#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) +struct bpf_iter__ipv6_route { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct fib6_info *, rt); +}; + +static int ipv6_route_prog_seq_show(struct bpf_prog *prog, + struct bpf_iter_meta *meta, + void *v) +{ + struct bpf_iter__ipv6_route ctx; + + ctx.meta = meta; + ctx.rt = v; + return bpf_iter_run_prog(prog, &ctx); +} + +static int ipv6_route_seq_show(struct seq_file *seq, void *v) +{ + struct ipv6_route_iter *iter = seq->private; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, false); + if (!prog) + return ipv6_route_native_seq_show(seq, v); + + ret = ipv6_route_prog_seq_show(prog, &meta, v); + iter->w.leaf = NULL; + + return ret; +} + +static void ipv6_route_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta 
meta; + struct bpf_prog *prog; + + if (!v) { + meta.seq = seq; + prog = bpf_iter_get_info(&meta, true); + if (prog) + (void)ipv6_route_prog_seq_show(prog, &meta, v); + } + + ipv6_route_native_seq_stop(seq, v); +} +#else +static int ipv6_route_seq_show(struct seq_file *seq, void *v) +{ + return ipv6_route_native_seq_show(seq, v); +} + +static void ipv6_route_seq_stop(struct seq_file *seq, void *v) +{ + ipv6_route_native_seq_stop(seq, v); +} +#endif + const struct seq_operations ipv6_route_seq_ops = { .start = ipv6_route_seq_start, .next = ipv6_route_seq_next, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3912aac7854d..25f6d3e619d0 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6393,6 +6393,30 @@ void __init ip6_route_init_special_entries(void) #endif } +#if IS_BUILTIN(CONFIG_IPV6) +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt) + +static int __init bpf_iter_register(void) +{ + struct bpf_iter_reg reg_info = { + .target = "ipv6_route", + .seq_ops = &ipv6_route_seq_ops, + .init_seq_private = bpf_iter_init_seq_net, + .fini_seq_private = bpf_iter_fini_seq_net, + .seq_priv_size = sizeof(struct ipv6_route_iter), + }; + + return bpf_iter_reg_target(®_info); +} + +static void bpf_iter_unregister(void) +{ + bpf_iter_unreg_target("ipv6_route"); +} +#endif +#endif + int __init ip6_route_init(void) { int ret; @@ -6455,6 +6479,14 @@ int __init ip6_route_init(void) if (ret) goto out_register_late_subsys; +#if IS_BUILTIN(CONFIG_IPV6) +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) + ret = bpf_iter_register(); + if (ret) + goto out_register_late_subsys; +#endif +#endif + for_each_possible_cpu(cpu) { struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); @@ -6487,6 +6519,11 @@ out_kmem_cache: void ip6_route_cleanup(void) { +#if IS_BUILTIN(CONFIG_IPV6) +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) + bpf_iter_unregister(); +#endif +#endif unregister_netdevice_notifier(&ip6_route_dev_notifier); unregister_pernet_subsys(&ip6_route_net_late_ops); fib6_rules_cleanup(); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 5ded01ca8b20..33cda9baa979 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2596,7 +2596,7 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) return __netlink_seq_next(seq); } -static void netlink_seq_stop(struct seq_file *seq, void *v) +static void netlink_native_seq_stop(struct seq_file *seq, void *v) { struct nl_seq_iter *iter = seq->private; @@ -2607,7 +2607,7 @@ static void netlink_seq_stop(struct seq_file *seq, void *v) } -static int netlink_seq_show(struct seq_file *seq, void *v) +static int netlink_native_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, @@ -2634,6 +2634,68 @@ static int netlink_seq_show(struct seq_file *seq, void *v) return 0; } +#ifdef CONFIG_BPF_SYSCALL +struct bpf_iter__netlink { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct netlink_sock *, sk); +}; + +DEFINE_BPF_ITER_FUNC(netlink, struct bpf_iter_meta *meta, struct netlink_sock *sk) + +static int netlink_prog_seq_show(struct bpf_prog *prog, + struct bpf_iter_meta *meta, + void *v) +{ + struct bpf_iter__netlink ctx; + + meta->seq_num--; /* skip SEQ_START_TOKEN */ + ctx.meta = meta; + ctx.sk = nlk_sk((struct sock *)v); + return bpf_iter_run_prog(prog, &ctx); +} + +static int netlink_seq_show(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta 
meta; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, false); + if (!prog) + return netlink_native_seq_show(seq, v); + + if (v != SEQ_START_TOKEN) + return netlink_prog_seq_show(prog, &meta, v); + + return 0; +} + +static void netlink_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + if (!v) { + meta.seq = seq; + prog = bpf_iter_get_info(&meta, true); + if (prog) + (void)netlink_prog_seq_show(prog, &meta, v); + } + + netlink_native_seq_stop(seq, v); +} +#else +static int netlink_seq_show(struct seq_file *seq, void *v) +{ + return netlink_native_seq_show(seq, v); +} + +static void netlink_seq_stop(struct seq_file *seq, void *v) +{ + netlink_native_seq_stop(seq, v); +} +#endif + static const struct seq_operations netlink_seq_ops = { .start = netlink_seq_start, .next = netlink_seq_next, @@ -2740,6 +2802,21 @@ static const struct rhashtable_params netlink_rhashtable_params = { .automatic_shrinking = true, }; +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +static int __init bpf_iter_register(void) +{ + struct bpf_iter_reg reg_info = { + .target = "netlink", + .seq_ops = &netlink_seq_ops, + .init_seq_private = bpf_iter_init_seq_net, + .fini_seq_private = bpf_iter_fini_seq_net, + .seq_priv_size = sizeof(struct nl_seq_iter), + }; + + return bpf_iter_reg_target(®_info); +} +#endif + static int __init netlink_proto_init(void) { int i; @@ -2748,6 +2825,12 @@ static int __init netlink_proto_init(void) if (err != 0) goto out; +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) + err = bpf_iter_register(); + if (err) + goto out; +#endif + BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof_field(struct sk_buff, cb)); nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL); -- cgit v1.2.3 From 11ca3c4261cdb4e2f33e32daf6447f8185843317 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 10 May 2020 21:12:33 +0200 Subject: net: ethtool: netlink: Add support for triggering a cable test Add new ethtool netlink calls to trigger the starting of a PHY cable test. Add Kconfig'ury to ETHTOOL_NETLINK so that PHYLIB is not a module when ETHTOOL_NETLINK is builtin, which would result in kernel linking errors. 
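
[Illustration, not part of the patch: driver-side, the trigger lands in phy_start_cable_test(), which relies on two PHY driver hooks introduced alongside this series. A hedged sketch of that contract follows, with a made-up register layout — EX_CABLE_DIAG_REG and its bit assignment are hypothetical, not a real PHY.]

/* Hypothetical vendor register; bit 15 starts the test and self-clears. */
#define EX_CABLE_DIAG_REG	0x1e

static int example_cable_test_start(struct phy_device *phydev)
{
	return phy_set_bits(phydev, EX_CABLE_DIAG_REG, BIT(15));
}

static int example_cable_test_get_status(struct phy_device *phydev,
					 bool *finished)
{
	int val = phy_read(phydev, EX_CABLE_DIAG_REG);

	if (val < 0)
		return val;

	*finished = !(val & BIT(15));
	/* On completion, per-pair results are reported through the
	 * helpers added later in this series (ethnl_cable_test_result()
	 * and friends).
	 */
	return 0;
}

static struct phy_driver example_driver = {
	/* ... */
	.cable_test_start	= example_cable_test_start,
	.cable_test_get_status	= example_cable_test_get_status,
};
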
v2: Remove unwanted white space change Remove ethnl_cable_test_act_ops and use doit handler Rename cable_test_set_policy cable_test_act_policy Remove ETHTOOL_MSG_CABLE_TEST_ACT_REPLY v3: Remove ETHTOOL_MSG_CABLE_TEST_ACT_REPLY from documentation Remove unused cable_test_get_policy Add Reviewed-by tags v4: Remove unwanted blank line Signed-off-by: Andrew Lunn Reviewed-by: Michal Kubecek Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- Documentation/networking/ethtool-netlink.rst | 16 ++++++++- include/uapi/linux/ethtool_netlink.h | 12 +++++++ net/Kconfig | 1 + net/ethtool/Makefile | 2 +- net/ethtool/cabletest.c | 54 ++++++++++++++++++++++++++++ net/ethtool/netlink.c | 5 +++ net/ethtool/netlink.h | 1 + 7 files changed, 89 insertions(+), 2 deletions(-) create mode 100644 net/ethtool/cabletest.c (limited to 'net') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 8f5cefc539cf..a8731d33d0c9 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -204,6 +204,7 @@ Userspace to kernel: ``ETHTOOL_MSG_EEE_GET`` get EEE settings ``ETHTOOL_MSG_EEE_SET`` set EEE settings ``ETHTOOL_MSG_TSINFO_GET`` get timestamping info + ``ETHTOOL_MSG_CABLE_TEST_ACT`` action start cable test ===================================== ================================ Kernel to userspace: @@ -958,13 +959,25 @@ Kernel response contents: is no special value for this case). The bitset attributes are omitted if they would be empty (no bit set). +CABLE_TEST +========== + +Start a cable test. + +Request contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_CABLE_TEST_HEADER`` nested request header + ==================================== ====== ========================== + Request translation =================== The following table maps ioctl commands to netlink commands providing their functionality. Entries with "n/a" in right column are commands which do not -have their netlink replacement yet. +have their netlink replacement yet. Entries which "n/a" in the left column +are netlink only. =================================== ===================================== ioctl command netlink command @@ -1053,4 +1066,5 @@ have their netlink replacement yet. 
``ETHTOOL_PHY_STUNABLE`` n/a ``ETHTOOL_GFECPARAM`` n/a ``ETHTOOL_SFECPARAM`` n/a + n/a ''ETHTOOL_MSG_CABLE_TEST_ACT'' =================================== ===================================== diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index bf1d310e20bc..6bfd648c32cf 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -39,6 +39,7 @@ enum { ETHTOOL_MSG_EEE_GET, ETHTOOL_MSG_EEE_SET, ETHTOOL_MSG_TSINFO_GET, + ETHTOOL_MSG_CABLE_TEST_ACT, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -405,6 +406,17 @@ enum { ETHTOOL_A_TSINFO_MAX = (__ETHTOOL_A_TSINFO_CNT - 1) }; +/* CABLE TEST */ + +enum { + ETHTOOL_A_CABLE_TEST_UNSPEC, + ETHTOOL_A_CABLE_TEST_HEADER, /* nest - _A_HEADER_* */ + + /* add new constants above here */ + __ETHTOOL_A_CABLE_TEST_CNT, + ETHTOOL_A_CABLE_TEST_MAX = __ETHTOOL_A_CABLE_TEST_CNT - 1 +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/Kconfig b/net/Kconfig index c5ba2d180c43..5c524c6ee75d 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -455,6 +455,7 @@ config FAILOVER config ETHTOOL_NETLINK bool "Netlink interface for ethtool" default y + depends on PHYLIB=y || PHYLIB=n help An alternative userspace interface for ethtool based on generic netlink. It provides better extensibility and some new features, diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 6c360c9c9370..0c2b94f20499 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -6,4 +6,4 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ - channels.o coalesce.o pause.o eee.o tsinfo.o + channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c new file mode 100644 index 000000000000..aeb6672a46d0 --- /dev/null +++ b/net/ethtool/cabletest.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include "netlink.h" +#include "common.h" + +/* CABLE_TEST_ACT */ + +static const struct nla_policy +cable_test_act_policy[ETHTOOL_A_CABLE_TEST_MAX + 1] = { + [ETHTOOL_A_CABLE_TEST_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_CABLE_TEST_HEADER] = { .type = NLA_NESTED }, +}; + +int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[ETHTOOL_A_CABLE_TEST_MAX + 1]; + struct ethnl_req_info req_info = {}; + struct net_device *dev; + int ret; + + ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, + ETHTOOL_A_CABLE_TEST_MAX, + cable_test_act_policy, info->extack); + if (ret < 0) + return ret; + + ret = ethnl_parse_header_dev_get(&req_info, + tb[ETHTOOL_A_CABLE_TEST_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + + dev = req_info.dev; + if (!dev->phydev) { + ret = -EOPNOTSUPP; + goto out_dev_put; + } + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + + ret = phy_start_cable_test(dev->phydev, info->extack); + + ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); +out_dev_put: + dev_put(dev); + return ret; +} diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 0c772318c023..b9c9ddf408fe 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -839,6 +839,11 @@ static const struct genl_ops ethtool_genl_ops[] = { .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, }, + { + .cmd = ETHTOOL_MSG_CABLE_TEST_ACT, + .flags = 
GENL_UNS_ADMIN_PERM, + .doit = ethnl_act_cable_test, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 81b8fa020bcb..bd7df592db2f 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -357,5 +357,6 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info); int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info); int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info); int ethnl_set_eee(struct sk_buff *skb, struct genl_info *info); +int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info); #endif /* _NET_ETHTOOL_NETLINK_H */ -- cgit v1.2.3 From 0df960f14e17e55e68dfd1342f063d17dbcc6107 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 10 May 2020 21:12:35 +0200 Subject: net: ethtool: Make helpers public Make some helpers for building ethtool netlink messages available outside the compilation unit, so they can be used for building messages which are not simple get/set. Signed-off-by: Andrew Lunn Reviewed-by: Michal Kubecek Signed-off-by: Jakub Kicinski --- net/ethtool/netlink.c | 4 ++-- net/ethtool/netlink.h | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index b9c9ddf408fe..87bc02da74bc 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -181,13 +181,13 @@ err: return NULL; } -static void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd) +void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd) { return genlmsg_put(skb, 0, ++ethnl_bcast_seq, ðtool_genl_family, 0, cmd); } -static int ethnl_multicast(struct sk_buff *skb, struct net_device *dev) +int ethnl_multicast(struct sk_buff *skb, struct net_device *dev) { return genlmsg_multicast_netns(ðtool_genl_family, dev_net(dev), skb, 0, ETHNL_MCGRP_MONITOR, GFP_KERNEL); diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index bd7df592db2f..b0eb5d920099 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -19,6 +19,8 @@ int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev, struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd, u16 hdr_attrtype, struct genl_info *info, void **ehdrp); +void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd); +int ethnl_multicast(struct sk_buff *skb, struct net_device *dev); /** * ethnl_strz_size() - calculate attribute length for fixed size string -- cgit v1.2.3 From 1dd3f212af30b42c90ba252c165f2f6d2ddf5230 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 10 May 2020 21:12:36 +0200 Subject: net: ethtool: Add infrastructure for reporting cable test results Provide infrastructure for PHY drivers to report the cable test results. A netlink skb is associated to the phydev. Helpers will be added which can add results to this skb. Once the test has finished the results are sent to user space. When netlink ethtool is not part of the kernel configuration stubs are provided. It is also impossible to trigger a cable test, so the error code returned by the alloc function is of no consequence. 
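
[Illustration, not part of the patch: the call order this infrastructure sets up, condensed from this patch and the phy.c changes below. Simplified — locking and state handling elided. The core pre-allocates the result skb before starting, frees it if the start fails, and multicasts it once the PHY reports completion.]

static int sketch_start(struct phy_device *phydev)
{
	int err = ethnl_cable_test_alloc(phydev);	/* pre-allocate result skb */

	if (err)
		return err;

	err = phydev->drv->cable_test_start(phydev);
	if (err) {
		ethnl_cable_test_free(phydev);		/* nothing will be reported */
		return err;
	}
	return 0;	/* PHY state machine now polls for completion */
}

static void sketch_poll(struct phy_device *phydev)
{
	bool finished = false;

	if (!phydev->drv->cable_test_get_status(phydev, &finished) && finished)
		ethnl_cable_test_finished(phydev);	/* close nest, multicast skb */
}
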
v2: Include the status complete in the netlink notification message v4: Replace -EINVAL with -EMSGSIZE Signed-off-by: Andrew Lunn Reviewed-by: Michal Kubecek Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy.c | 22 +++++++++++++++-- include/linux/ethtool_netlink.h | 20 +++++++++++++++ include/linux/phy.h | 5 ++++ net/ethtool/cabletest.c | 55 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 100 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 9fa61019533f..afdc1c2146ee 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -30,6 +31,9 @@ #include #include #include +#include +#include +#include #define PHY_STATE_TIME HZ @@ -478,6 +482,8 @@ static void phy_abort_cable_test(struct phy_device *phydev) { int err; + ethnl_cable_test_finished(phydev); + err = phy_init_hw(phydev); if (err) phydev_err(phydev, "Error while aborting cable test"); @@ -486,7 +492,7 @@ static void phy_abort_cable_test(struct phy_device *phydev) int phy_start_cable_test(struct phy_device *phydev, struct netlink_ext_ack *extack) { - int err; + int err = -ENOMEM; if (!(phydev->drv && phydev->drv->cable_test_start && @@ -512,19 +518,30 @@ int phy_start_cable_test(struct phy_device *phydev, goto out; } + err = ethnl_cable_test_alloc(phydev); + if (err) + goto out; + /* Mark the carrier down until the test is complete */ phy_link_down(phydev, true); err = phydev->drv->cable_test_start(phydev); if (err) { phy_link_up(phydev); - goto out; + goto out_free; } phydev->state = PHY_CABLETEST; if (phy_polling_mode(phydev)) phy_trigger_machine(phydev); + + mutex_unlock(&phydev->lock); + + return 0; + +out_free: + ethnl_cable_test_free(phydev); out: mutex_unlock(&phydev->lock); @@ -964,6 +981,7 @@ void phy_state_machine(struct work_struct *work) } if (finished) { + ethnl_cable_test_finished(phydev); needs_aneg = true; phydev->state = PHY_UP; } diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h index d01b77887f82..7d763ba22f6f 100644 --- a/include/linux/ethtool_netlink.h +++ b/include/linux/ethtool_netlink.h @@ -14,4 +14,24 @@ enum ethtool_multicast_groups { ETHNL_MCGRP_MONITOR, }; +struct phy_device; + +#if IS_ENABLED(CONFIG_ETHTOOL_NETLINK) +int ethnl_cable_test_alloc(struct phy_device *phydev); +void ethnl_cable_test_free(struct phy_device *phydev); +void ethnl_cable_test_finished(struct phy_device *phydev); +#else +static inline int ethnl_cable_test_alloc(struct phy_device *phydev) +{ + return -ENOTSUPP; +} + +static inline void ethnl_cable_test_free(struct phy_device *phydev) +{ +} + +static inline void ethnl_cable_test_finished(struct phy_device *phydev) +{ +} +#endif /* IS_ENABLED(ETHTOOL_NETLINK) */ #endif /* _LINUX_ETHTOOL_NETLINK_H_ */ diff --git a/include/linux/phy.h b/include/linux/phy.h index f58eee735a45..169fae4249a9 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -523,6 +523,11 @@ struct phy_device { /* For use by PHYs inside the same package that need a shared state. 
*/ struct phy_package_shared *shared; + /* Reporting cable test results */ + struct sk_buff *skb; + void *ehdr; + struct nlattr *nest; + /* Interrupt and Polling infrastructure */ struct delayed_work state_queue; diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index aeb6672a46d0..ae8e63647663 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only #include +#include #include "netlink.h" #include "common.h" @@ -52,3 +53,57 @@ out_dev_put: dev_put(dev); return ret; } + +int ethnl_cable_test_alloc(struct phy_device *phydev) +{ + int err = -ENOMEM; + + phydev->skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!phydev->skb) + goto out; + + phydev->ehdr = ethnl_bcastmsg_put(phydev->skb, + ETHTOOL_MSG_CABLE_TEST_NTF); + if (!phydev->ehdr) { + err = -EMSGSIZE; + goto out; + } + + err = ethnl_fill_reply_header(phydev->skb, phydev->attached_dev, + ETHTOOL_A_CABLE_TEST_NTF_HEADER); + if (err) + goto out; + + err = nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_TEST_NTF_STATUS, + ETHTOOL_A_CABLE_TEST_NTF_STATUS_COMPLETED); + if (err) + goto out; + + phydev->nest = nla_nest_start(phydev->skb, + ETHTOOL_A_CABLE_TEST_NTF_NEST); + if (!phydev->nest) + goto out; + + return 0; + +out: + nlmsg_free(phydev->skb); + return err; +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_alloc); + +void ethnl_cable_test_free(struct phy_device *phydev) +{ + nlmsg_free(phydev->skb); +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_free); + +void ethnl_cable_test_finished(struct phy_device *phydev) +{ + nla_nest_end(phydev->skb, phydev->nest); + + genlmsg_end(phydev->skb, phydev->ehdr); + + ethnl_multicast(phydev->skb, phydev->attached_dev); +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_finished); -- cgit v1.2.3 From 1e2dc14509fd072739e4bab98ac42317267dbad6 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 10 May 2020 21:12:37 +0200 Subject: net: ethtool: Add helpers for reporting test results The PHY drivers can use these helpers for reporting the results. The results get translated into netlink attributes which are added to the pre-allocated skbuf. 
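
[Illustration, not part of the patch: a hedged example of how a PHY driver's completion path might use the new helpers. The pair and result-code constants come from the companion UAPI patch in this series and are named here from memory; the reported values are invented.]

static void example_report_results(struct phy_device *phydev)
{
	/* Pair A tested OK; pair B is open 15.00 m (1500 cm) out. */
	ethnl_cable_test_result(phydev, ETHTOOL_A_CABLE_PAIR_A,
				ETHTOOL_A_CABLE_RESULT_CODE_OK);
	ethnl_cable_test_result(phydev, ETHTOOL_A_CABLE_PAIR_B,
				ETHTOOL_A_CABLE_RESULT_CODE_OPEN);
	ethnl_cable_test_fault_length(phydev, ETHTOOL_A_CABLE_PAIR_B, 1500);
}
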
v3: Poison phydev->skb Return -EMSGSIZE when ethnl_bcastmsg_put() fails Return valid error code when nla_nest_start() fails Use u8 for results Actually put u32 length into message v4: s/ENOTSUPP/EOPNOTSUPP/g Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Reviewed-by: Michal Kubecek Signed-off-by: Jakub Kicinski --- include/linux/ethtool_netlink.h | 15 +++++++++++- include/linux/phy.h | 4 ++++ net/ethtool/cabletest.c | 53 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 70 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h index 7d763ba22f6f..e317fc99565e 100644 --- a/include/linux/ethtool_netlink.h +++ b/include/linux/ethtool_netlink.h @@ -20,10 +20,12 @@ struct phy_device; int ethnl_cable_test_alloc(struct phy_device *phydev); void ethnl_cable_test_free(struct phy_device *phydev); void ethnl_cable_test_finished(struct phy_device *phydev); +int ethnl_cable_test_result(struct phy_device *phydev, u8 pair, u8 result); +int ethnl_cable_test_fault_length(struct phy_device *phydev, u8 pair, u32 cm); #else static inline int ethnl_cable_test_alloc(struct phy_device *phydev) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline void ethnl_cable_test_free(struct phy_device *phydev) @@ -33,5 +35,16 @@ static inline void ethnl_cable_test_free(struct phy_device *phydev) static inline void ethnl_cable_test_finished(struct phy_device *phydev) { } +static inline int ethnl_cable_test_result(struct phy_device *phydev, u8 pair, + u8 result) +{ + return -EOPNOTSUPP; +} + +static inline int ethnl_cable_test_fault_length(struct phy_device *phydev, + u8 pair, u32 cm) +{ + return -EOPNOTSUPP; +} #endif /* IS_ENABLED(ETHTOOL_NETLINK) */ #endif /* _LINUX_ETHTOOL_NETLINK_H_ */ diff --git a/include/linux/phy.h b/include/linux/phy.h index 169fae4249a9..5d8ff5428010 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1265,6 +1265,10 @@ int phy_start_cable_test(struct phy_device *phydev, } #endif +int phy_cable_test_result(struct phy_device *phydev, u8 pair, u16 result); +int phy_cable_test_fault_length(struct phy_device *phydev, u8 pair, + u16 cm); + static inline void phy_device_reset(struct phy_device *phydev, int value) { mdio_device_reset(&phydev->mdio, value); diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index ae8e63647663..e0c917918c70 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -81,13 +81,16 @@ int ethnl_cable_test_alloc(struct phy_device *phydev) phydev->nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_TEST_NTF_NEST); - if (!phydev->nest) + if (!phydev->nest) { + err = -EMSGSIZE; goto out; + } return 0; out: nlmsg_free(phydev->skb); + phydev->skb = NULL; return err; } EXPORT_SYMBOL_GPL(ethnl_cable_test_alloc); @@ -95,6 +98,7 @@ EXPORT_SYMBOL_GPL(ethnl_cable_test_alloc); void ethnl_cable_test_free(struct phy_device *phydev) { nlmsg_free(phydev->skb); + phydev->skb = NULL; } EXPORT_SYMBOL_GPL(ethnl_cable_test_free); @@ -107,3 +111,50 @@ void ethnl_cable_test_finished(struct phy_device *phydev) ethnl_multicast(phydev->skb, phydev->attached_dev); } EXPORT_SYMBOL_GPL(ethnl_cable_test_finished); + +int ethnl_cable_test_result(struct phy_device *phydev, u8 pair, u8 result) +{ + struct nlattr *nest; + int ret = -EMSGSIZE; + + nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_NEST_RESULT); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_RESULT_PAIR, pair)) + goto err; + if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_RESULT_CODE, 
result)) + goto err; + + nla_nest_end(phydev->skb, nest); + return 0; + +err: + nla_nest_cancel(phydev->skb, nest); + return ret; +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_result); + +int ethnl_cable_test_fault_length(struct phy_device *phydev, u8 pair, u32 cm) +{ + struct nlattr *nest; + int ret = -EMSGSIZE; + + nest = nla_nest_start(phydev->skb, + ETHTOOL_A_CABLE_NEST_FAULT_LENGTH); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_FAULT_LENGTH_PAIR, pair)) + goto err; + if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_FAULT_LENGTH_CM, cm)) + goto err; + + nla_nest_end(phydev->skb, nest); + return 0; + +err: + nla_nest_cancel(phydev->skb, nest); + return ret; +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_fault_length); -- cgit v1.2.3 From 9896a4574ecb137d4e5b9283004aa34c688bc761 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 10 May 2020 21:12:40 +0200 Subject: net: phy: Send notifier when starting the cable test Given that it takes time to run a cable test, send a notify message at the start, as well as when it is completed. v3: EMSGSIZE when ethnl_bcastmsg_put() fails Print an error message on failure, since this is a void function. Signed-off-by: Andrew Lunn Reviewed-by: Michal Kubecek Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- net/ethtool/cabletest.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'net') diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index e0c917918c70..5ba06eabe8c2 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -13,6 +13,43 @@ cable_test_act_policy[ETHTOOL_A_CABLE_TEST_MAX + 1] = { [ETHTOOL_A_CABLE_TEST_HEADER] = { .type = NLA_NESTED }, }; +static int ethnl_cable_test_started(struct phy_device *phydev) +{ + struct sk_buff *skb; + int err = -ENOMEM; + void *ehdr; + + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + goto out; + + ehdr = ethnl_bcastmsg_put(skb, ETHTOOL_MSG_CABLE_TEST_NTF); + if (!ehdr) { + err = -EMSGSIZE; + goto out; + } + + err = ethnl_fill_reply_header(skb, phydev->attached_dev, + ETHTOOL_A_CABLE_TEST_NTF_HEADER); + if (err) + goto out; + + err = nla_put_u8(skb, ETHTOOL_A_CABLE_TEST_NTF_STATUS, + ETHTOOL_A_CABLE_TEST_NTF_STATUS_STARTED); + if (err) + goto out; + + genlmsg_end(skb, ehdr); + + return ethnl_multicast(skb, phydev->attached_dev); + +out: + nlmsg_free(skb); + phydev_err(phydev, "%s: Error %pe\n", __func__, ERR_PTR(err)); + + return err; +} + int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[ETHTOOL_A_CABLE_TEST_MAX + 1]; @@ -47,6 +84,10 @@ int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info) ret = phy_start_cable_test(dev->phydev, info->extack); ethnl_ops_complete(dev); + + if (!ret) + ethnl_cable_test_started(dev->phydev); + out_rtnl: rtnl_unlock(); out_dev_put: -- cgit v1.2.3 From 9eb8eff0cf2f1e1afc0756bb30cb9746ba90dd07 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 10 May 2020 19:37:40 +0300 Subject: net: bridge: allow enslaving some DSA master network devices Commit 8db0a2ee2c63 ("net: bridge: reject DSA-enabled master netdevices as bridge members") added a special check in br_if.c in order to check for a DSA master network device with a tagging protocol configured. This was done because back then, such devices, once enslaved in a bridge would become inoperative and would not pass DSA tagged traffic anymore due to br_handle_frame returning RX_HANDLER_CONSUMED. 
But right now we have valid use cases which do require bridging of DSA masters. One such example is when the DSA master ports are DSA switch ports themselves (in a disjoint tree setup). This should be completely equivalent, functionally speaking, to having multiple DSA switches hanging off of the ports of a switchdev driver. So we should allow the enslaving of DSA tagged master network devices. Instead of the regular br_handle_frame(), install a new function br_handle_frame_dummy() on these DSA masters, which returns RX_HANDLER_PASS in order to call into the DSA specific tagging protocol handlers, and lift the restriction from br_add_if. Suggested-by: Nikolay Aleksandrov Suggested-by: Florian Fainelli Signed-off-by: Vladimir Oltean Acked-by: Nikolay Aleksandrov Reviewed-by: Florian Fainelli Tested-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 2 +- net/bridge/br_if.c | 32 +++++++++++++++++++++++--------- net/bridge/br_input.c | 23 ++++++++++++++++++++++- net/bridge/br_private.h | 6 +++--- 4 files changed, 49 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 6dfc8c2f68b8..02fb5025e0ac 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -651,7 +651,7 @@ struct dsa_switch_driver { struct net_device *dsa_dev_to_net_device(struct device *dev); /* Keep inline for faster access in hot path */ -static inline bool netdev_uses_dsa(struct net_device *dev) +static inline bool netdev_uses_dsa(const struct net_device *dev) { #if IS_ENABLED(CONFIG_NET_DSA) return dev->dsa_ptr && dev->dsa_ptr->rcv; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index ca685c0cdf95..a0e9a7937412 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -563,18 +563,32 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, unsigned br_hr, dev_hr; bool changed_addr; - /* Don't allow bridging non-ethernet like devices, or DSA-enabled - * master network devices since the bridge layer rx_handler prevents - * the DSA fake ethertype handler to be invoked, so we do not strip off - * the DSA switch tag protocol header and the bridge layer just return - * RX_HANDLER_CONSUMED, stopping RX processing for these frames. - */ + /* Don't allow bridging non-ethernet like devices. */ if ((dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN || - !is_valid_ether_addr(dev->dev_addr) || - netdev_uses_dsa(dev)) + !is_valid_ether_addr(dev->dev_addr)) return -EINVAL; + /* Also don't allow bridging of net devices that are DSA masters, since + * the bridge layer rx_handler prevents the DSA fake ethertype handler + * to be invoked, so we don't get the chance to strip off and parse the + * DSA switch tag protocol header (the bridge layer just returns + * RX_HANDLER_CONSUMED, stopping RX processing for these frames). + * The only case where that would not be an issue is when bridging can + * already be offloaded, such as when the DSA master is itself a DSA + * or plain switchdev port, and is bridged only with other ports from + * the same hardware device. 
+ */ + if (netdev_uses_dsa(dev)) { + list_for_each_entry(p, &br->port_list, list) { + if (!netdev_port_same_parent_id(dev, p->dev)) { + NL_SET_ERR_MSG(extack, + "Cannot do software bridging with a DSA master"); + return -EINVAL; + } + } + } + /* No bridging of bridges */ if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit) { NL_SET_ERR_MSG(extack, @@ -618,7 +632,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, if (err) goto err3; - err = netdev_rx_handler_register(dev, br_handle_frame, p); + err = netdev_rx_handler_register(dev, br_get_rx_handler(dev), p); if (err) goto err4; diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index d5c34f36f0f4..59a318b9f646 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -17,6 +17,7 @@ #endif #include #include +#include #include #include #include "br_private.h" @@ -257,7 +258,7 @@ frame_finish: * Return NULL if skb is handled * note: already called with rcu_read_lock */ -rx_handler_result_t br_handle_frame(struct sk_buff **pskb) +static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) { struct net_bridge_port *p; struct sk_buff *skb = *pskb; @@ -359,3 +360,23 @@ drop: } return RX_HANDLER_CONSUMED; } + +/* This function has no purpose other than to appease the br_port_get_rcu/rtnl + * helpers which identify bridged ports according to the rx_handler installed + * on them (so there _needs_ to be a bridge rx_handler even if we don't need it + * to do anything useful). This bridge won't support traffic to/from the stack, + * but only hardware bridging. So return RX_HANDLER_PASS so we don't steal + * frames from the ETH_P_XDSA packet_type handler. + */ +static rx_handler_result_t br_handle_frame_dummy(struct sk_buff **pskb) +{ + return RX_HANDLER_PASS; +} + +rx_handler_func_t *br_get_rx_handler(const struct net_device *dev) +{ + if (netdev_uses_dsa(dev)) + return br_handle_frame_dummy; + + return br_handle_frame; +} diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 4dc21e8f7e33..7501be4eeba0 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -702,16 +702,16 @@ int nbp_backup_change(struct net_bridge_port *p, struct net_device *backup_dev); /* br_input.c */ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb); -rx_handler_result_t br_handle_frame(struct sk_buff **pskb); +rx_handler_func_t *br_get_rx_handler(const struct net_device *dev); static inline bool br_rx_handler_check_rcu(const struct net_device *dev) { - return rcu_dereference(dev->rx_handler) == br_handle_frame; + return rcu_dereference(dev->rx_handler) == br_get_rx_handler(dev); } static inline bool br_rx_handler_check_rtnl(const struct net_device *dev) { - return rcu_dereference_rtnl(dev->rx_handler) == br_handle_frame; + return rcu_dereference_rtnl(dev->rx_handler) == br_get_rx_handler(dev); } static inline struct net_bridge_port *br_port_get_check_rcu(const struct net_device *dev) -- cgit v1.2.3 From f66a6a69f97a24546664541237a82b288c2713f6 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 10 May 2020 19:37:41 +0300 Subject: net: dsa: permit cross-chip bridging between all trees in the system One way of utilizing DSA is by cascading switches which do not all have compatible taggers. 
Consider the following real-life topology:

+---------------------------------------------------------------+
|                            LS1028A                            |
|               +------------------------------+                |
|               |     DSA master for Felix     |                |
|               |(internal ENETC port 2: eno2))|                |
|  +------------+------------------------------+-------------+  |
|  |                Felix embedded L2 switch                 |  |
|  |                                                         |  |
|  | +--------------+   +--------------+   +--------------+  |  |
|  | |DSA master for|   |DSA master for|   |DSA master for|  |  |
|  | |  SJA1105 1   |   |  SJA1105 2   |   |  SJA1105 3   |  |  |
|  | |(Felix port 1)|   |(Felix port 2)|   |(Felix port 3)|  |  |
+--+-+--------------+---+--------------+---+--------------+--+--+

+-----------------------+ +-----------------------+ +-----------------------+
|   SJA1105 switch 1    | |   SJA1105 switch 2    | |   SJA1105 switch 3    |
+-----+-----+-----+-----+ +-----+-----+-----+-----+ +-----+-----+-----+-----+
|sw1p0|sw1p1|sw1p2|sw1p3| |sw2p0|sw2p1|sw2p2|sw2p3| |sw3p0|sw3p1|sw3p2|sw3p3|
+-----+-----+-----+-----+ +-----+-----+-----+-----+ +-----+-----+-----+-----+

The above can be described in the device tree as follows (obviously not complete):

mscc_felix {
	dsa,member = <0 0>;
	ports {
		port@4 {
			ethernet = <&enetc_port2>;
		};
	};
};

sja1105_switch1 {
	dsa,member = <1 1>;
	ports {
		port@4 {
			ethernet = <&mscc_felix_port1>;
		};
	};
};

sja1105_switch2 {
	dsa,member = <2 2>;
	ports {
		port@4 {
			ethernet = <&mscc_felix_port2>;
		};
	};
};

sja1105_switch3 {
	dsa,member = <3 3>;
	ports {
		port@4 {
			ethernet = <&mscc_felix_port3>;
		};
	};
};

Basically we instantiate one DSA switch tree for every hardware switch in the system, but we still give them globally unique switch IDs (will come back to that later). Having 3 disjoint switch trees makes the tagger drivers "just work", because net devices are registered for the 3 Felix DSA master ports, and they are also DSA slave ports to the ENETC port. So packets received on the ENETC port are stripped of their stacked DSA tags one by one.

Currently, hardware bridging between ports on the same sja1105 chip is possible, but switching between sja1105 ports on different chips is handled by the software bridge. This is fine, but we can do better. In fact, the dsa_8021q tag used by sja1105 is compatible with cascading. In other words, a sja1105 switch can correctly parse and route a packet containing a dsa_8021q tag. So if we could enable hardware bridging on the Felix DSA master ports, cross-chip bridging could be completely offloaded.

Such a system would be used as follows:

ip link add dev br0 type bridge && ip link set dev br0 up

for port in sw0p0 sw0p1 sw0p2 sw0p3 \
	    sw1p0 sw1p1 sw1p2 sw1p3 \
	    sw2p0 sw2p1 sw2p2 sw2p3; do
	ip link set dev $port master br0
done

The above makes switching between ports on the same row be performed in hardware, and between ports on different rows in software. Now assume the Felix switch ports are called swp0, swp1, swp2. By running the following extra commands:

ip link add dev br1 type bridge && ip link set dev br1 up

for port in swp0 swp1 swp2; do
	ip link set dev $port master br1
done

the CPU no longer sees packets which traverse sja1105 switch boundaries and can be forwarded directly by Felix. The br1 bridge would not be used for any sort of traffic termination.

For this to work, we need to give drivers an opportunity to listen for bridging events on DSA trees other than their own, and pass that other tree index as argument. I have made the assumption, for the moment, that the other existing DSA notifiers don't need to be broadcast to other trees. That assumption might turn out to be incorrect. 
But in the meantime, introduce a dsa_broadcast function, similar in purpose to dsa_port_notify, which is used only by the bridging notifiers. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/chip.c | 16 ++++++++++++---- include/net/dsa.h | 10 ++++++---- net/dsa/dsa_priv.h | 1 + net/dsa/port.c | 23 +++++++++++++++++++++-- net/dsa/switch.c | 21 +++++++++++++++------ 5 files changed, 55 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 2b4a723c8306..7627ea61e0ea 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -2233,26 +2233,34 @@ static void mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port, mv88e6xxx_reg_unlock(chip); } -static int mv88e6xxx_crosschip_bridge_join(struct dsa_switch *ds, int dev, +static int mv88e6xxx_crosschip_bridge_join(struct dsa_switch *ds, + int tree_index, int sw_index, int port, struct net_device *br) { struct mv88e6xxx_chip *chip = ds->priv; int err; + if (tree_index != ds->dst->index) + return 0; + mv88e6xxx_reg_lock(chip); - err = mv88e6xxx_pvt_map(chip, dev, port); + err = mv88e6xxx_pvt_map(chip, sw_index, port); mv88e6xxx_reg_unlock(chip); return err; } -static void mv88e6xxx_crosschip_bridge_leave(struct dsa_switch *ds, int dev, +static void mv88e6xxx_crosschip_bridge_leave(struct dsa_switch *ds, + int tree_index, int sw_index, int port, struct net_device *br) { struct mv88e6xxx_chip *chip = ds->priv; + if (tree_index != ds->dst->index) + return; + mv88e6xxx_reg_lock(chip); - if (mv88e6xxx_pvt_map(chip, dev, port)) + if (mv88e6xxx_pvt_map(chip, sw_index, port)) dev_err(ds->dev, "failed to remap cross-chip Port VLAN\n"); mv88e6xxx_reg_unlock(chip); } diff --git a/include/net/dsa.h b/include/net/dsa.h index 02fb5025e0ac..0f4fc00239d9 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -574,10 +574,12 @@ struct dsa_switch_ops { /* * Cross-chip operations */ - int (*crosschip_bridge_join)(struct dsa_switch *ds, int sw_index, - int port, struct net_device *br); - void (*crosschip_bridge_leave)(struct dsa_switch *ds, int sw_index, - int port, struct net_device *br); + int (*crosschip_bridge_join)(struct dsa_switch *ds, int tree_index, + int sw_index, int port, + struct net_device *br); + void (*crosschip_bridge_leave)(struct dsa_switch *ds, int tree_index, + int sw_index, int port, + struct net_device *br); /* * PTP functionality diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 6d9a1ef65fa0..a1a0ae242012 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -35,6 +35,7 @@ struct dsa_notifier_ageing_time_info { /* DSA_NOTIFIER_BRIDGE_* */ struct dsa_notifier_bridge_info { struct net_device *br; + int tree_index; int sw_index; int port; }; diff --git a/net/dsa/port.c b/net/dsa/port.c index a58fdd362574..ebc8d6cbd1d4 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -13,6 +13,23 @@ #include "dsa_priv.h" +static int dsa_broadcast(unsigned long e, void *v) +{ + struct dsa_switch_tree *dst; + int err = 0; + + list_for_each_entry(dst, &dsa_tree_list, list) { + struct raw_notifier_head *nh = &dst->nh; + + err = raw_notifier_call_chain(nh, e, v); + err = notifier_to_errno(err); + if (err) + break; + } + + return err; +} + static int dsa_port_notify(const struct dsa_port *dp, unsigned long e, void *v) { struct raw_notifier_head *nh = &dp->ds->dst->nh; @@ -120,6 +137,7 @@ void dsa_port_disable(struct dsa_port *dp) int dsa_port_bridge_join(struct 
dsa_port *dp, struct net_device *br) { struct dsa_notifier_bridge_info info = { + .tree_index = dp->ds->dst->index, .sw_index = dp->ds->index, .port = dp->index, .br = br, @@ -136,7 +154,7 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br) */ dp->bridge_dev = br; - err = dsa_port_notify(dp, DSA_NOTIFIER_BRIDGE_JOIN, &info); + err = dsa_broadcast(DSA_NOTIFIER_BRIDGE_JOIN, &info); /* The bridging is rolled back on error */ if (err) { @@ -150,6 +168,7 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br) void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) { struct dsa_notifier_bridge_info info = { + .tree_index = dp->ds->dst->index, .sw_index = dp->ds->index, .port = dp->index, .br = br, @@ -161,7 +180,7 @@ void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) */ dp->bridge_dev = NULL; - err = dsa_port_notify(dp, DSA_NOTIFIER_BRIDGE_LEAVE, &info); + err = dsa_broadcast(DSA_NOTIFIER_BRIDGE_LEAVE, &info); if (err) pr_err("DSA: failed to notify DSA_NOTIFIER_BRIDGE_LEAVE\n"); diff --git a/net/dsa/switch.c b/net/dsa/switch.c index f3c32ff552b3..86c8dc5c32a0 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -89,11 +89,16 @@ static int dsa_switch_mtu(struct dsa_switch *ds, static int dsa_switch_bridge_join(struct dsa_switch *ds, struct dsa_notifier_bridge_info *info) { - if (ds->index == info->sw_index && ds->ops->port_bridge_join) + struct dsa_switch_tree *dst = ds->dst; + + if (dst->index == info->tree_index && ds->index == info->sw_index && + ds->ops->port_bridge_join) return ds->ops->port_bridge_join(ds, info->port, info->br); - if (ds->index != info->sw_index && ds->ops->crosschip_bridge_join) - return ds->ops->crosschip_bridge_join(ds, info->sw_index, + if ((dst->index != info->tree_index || ds->index != info->sw_index) && + ds->ops->crosschip_bridge_join) + return ds->ops->crosschip_bridge_join(ds, info->tree_index, + info->sw_index, info->port, info->br); return 0; @@ -103,13 +108,17 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, struct dsa_notifier_bridge_info *info) { bool unset_vlan_filtering = br_vlan_enabled(info->br); + struct dsa_switch_tree *dst = ds->dst; int err, i; - if (ds->index == info->sw_index && ds->ops->port_bridge_leave) + if (dst->index == info->tree_index && ds->index == info->sw_index && + ds->ops->port_bridge_join) ds->ops->port_bridge_leave(ds, info->port, info->br); - if (ds->index != info->sw_index && ds->ops->crosschip_bridge_leave) - ds->ops->crosschip_bridge_leave(ds, info->sw_index, info->port, + if ((dst->index != info->tree_index || ds->index != info->sw_index) && + ds->ops->crosschip_bridge_join) + ds->ops->crosschip_bridge_leave(ds, info->tree_index, + info->sw_index, info->port, info->br); /* If the bridge was vlan_filtering, the bridge core doesn't trigger an -- cgit v1.2.3 From 3b7bc1f09101ccace330d105c13c2946bf3be6d5 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 10 May 2020 19:37:42 +0300 Subject: net: dsa: introduce a dsa_switch_find function Somewhat similar to dsa_tree_find, dsa_switch_find returns a dsa_switch structure pointer by searching for its tree index and switch index (the parameters from dsa,member). To be used, for example, by drivers who implement .crosschip_bridge_join and need a reference to the other switch indicated by the tree_index and sw_index arguments. 
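As a hedged sketch of the intended usage (not code from this patch; the example_ prefix marks hypothetical names), a driver's .crosschip_bridge_join implementation could resolve the notifier arguments into a dsa_switch like this:

/* Sketch only: resolve (tree_index, sw_index) and bail out when the
 * other switch is absent or not driven by us.
 */
static int example_crosschip_bridge_join(struct dsa_switch *ds,
					 int tree_index, int sw_index,
					 int port, struct net_device *br)
{
	struct dsa_switch *other_ds = dsa_switch_find(tree_index, sw_index);

	/* The other tree may not be probed yet, or may belong to an
	 * unrelated driver.
	 */
	if (!other_ds || other_ds->ops != ds->ops)
		return 0;

	/* ...program this switch to accept traffic from other_ds... */
	return 0;
}

The sja1105 patch later in this series follows essentially this pattern.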
Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 1 + net/dsa/dsa2.c | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 0f4fc00239d9..312c2f067e65 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -672,6 +672,7 @@ static inline bool dsa_can_decode(const struct sk_buff *skb, void dsa_unregister_switch(struct dsa_switch *ds); int dsa_register_switch(struct dsa_switch *ds); +struct dsa_switch *dsa_switch_find(int tree_index, int sw_index); #ifdef CONFIG_PM_SLEEP int dsa_switch_suspend(struct dsa_switch *ds); int dsa_switch_resume(struct dsa_switch *ds); diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index d90665b465b8..076908fdd29b 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -24,6 +24,27 @@ LIST_HEAD(dsa_tree_list); static const struct devlink_ops dsa_devlink_ops = { }; +struct dsa_switch *dsa_switch_find(int tree_index, int sw_index) +{ + struct dsa_switch_tree *dst; + struct dsa_port *dp; + + list_for_each_entry(dst, &dsa_tree_list, list) { + if (dst->index != tree_index) + continue; + + list_for_each_entry(dp, &dst->ports, list) { + if (dp->ds->index != sw_index) + continue; + + return dp->ds; + } + } + + return NULL; +} +EXPORT_SYMBOL_GPL(dsa_switch_find); + static struct dsa_switch_tree *dsa_tree_find(int index) { struct dsa_switch_tree *dst; -- cgit v1.2.3 From ac02a451a6148bb9c395b39783ce7299eddf4f31 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 10 May 2020 19:37:43 +0300 Subject: net: dsa: sja1105: implement cross-chip bridging operations sja1105 uses dsa_8021q for DSA tagging, a format which is VLAN at heart and which is compatible with cascading. A complete description of this tagging format is in net/dsa/tag_8021q.c, but a quick summary is that each external-facing port tags incoming frames with a unique pvid, and this special VLAN is transmitted as tagged towards the inside of the system, and as untagged towards the exterior. The tag encodes the switch id and the source port index. This means that cross-chip bridging for dsa_8021q only entails adding the dsa_8021q pvids of one switch to the RX filter of the other switches. Everything else falls naturally into place, as long as the bottom-end of ports (the leaves in the tree) is comprised exclusively of dsa_8021q-compatible (i.e. sja1105 switches). Otherwise, there would be a chance that a front-panel switch transmits a packet tagged with a dsa_8021q header, header which it wouldn't be able to remove, and which would hence "leak" out. The only use case I tested (due to lack of board availability) was when the sja1105 switches are part of disjoint trees (however, this doesn't change the fact that multiple sja1105 switches still need unique switch identifiers in such a system). But in principle, even "true" single-tree setups (with DSA links) should work just as fine, except for a small change which I can't test: dsa_towards_port should be used instead of dsa_upstream_port (I made the assumption that the routing port that any sja1105 should use towards its neighbours is the CPU port. That might not hold true in other setups). 
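To make the "unique pvid per source port" idea concrete, a deliberately simplified sketch of such an encoding follows; the real bit layout lives in net/dsa/tag_8021q.c and differs in detail, so treat this as an assumption-laden illustration, not the actual format:

/* Simplified assumption, not the real tag_8021q layout: carve a VID
 * out of a reserved range per (switch, port) pair, so that any switch
 * in the cascade can tell from the VLAN tag where a frame entered the
 * system.
 */
static inline u16 example_8021q_rx_vid(int switch_id, int port)
{
	return 1024 | (switch_id << 4) | port;
}

With a scheme like this, offloading cross-chip forwarding reduces to installing switch A's RX VIDs into switch B's VLAN tables, which is what the crosschip link helpers added below do.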
Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/dsa/sja1105/sja1105.h | 2 + drivers/net/dsa/sja1105/sja1105_main.c | 90 ++++++++++++++++++++ include/linux/dsa/8021q.h | 45 ++++++++++ net/dsa/tag_8021q.c | 151 +++++++++++++++++++++++++++++++++ 4 files changed, 288 insertions(+) (limited to 'net') diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h index 8df2a5c53b02..a64ace07b89f 100644 --- a/drivers/net/dsa/sja1105/sja1105.h +++ b/drivers/net/dsa/sja1105/sja1105.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include "sja1105_static_config.h" @@ -185,6 +186,7 @@ struct sja1105_private { struct gpio_desc *reset_gpio; struct spi_device *spidev; struct dsa_switch *ds; + struct list_head crosschip_links; struct sja1105_flow_block flow_block; struct sja1105_port ports[SJA1105_NUM_PORTS]; /* Serializes transmission of management frames so that diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 666e54565df0..d5de9305df25 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -25,6 +25,8 @@ #include "sja1105_sgmii.h" #include "sja1105_tas.h" +static const struct dsa_switch_ops sja1105_switch_ops; + static void sja1105_hw_reset(struct gpio_desc *gpio, unsigned int pulse_len, unsigned int startup_delay) { @@ -1791,6 +1793,84 @@ static int sja1105_vlan_apply(struct sja1105_private *priv, int port, u16 vid, return 0; } +static int sja1105_crosschip_bridge_join(struct dsa_switch *ds, + int tree_index, int sw_index, + int other_port, struct net_device *br) +{ + struct dsa_switch *other_ds = dsa_switch_find(tree_index, sw_index); + struct sja1105_private *other_priv = other_ds->priv; + struct sja1105_private *priv = ds->priv; + int port, rc; + + if (other_ds->ops != &sja1105_switch_ops) + return 0; + + for (port = 0; port < ds->num_ports; port++) { + if (!dsa_is_user_port(ds, port)) + continue; + if (dsa_to_port(ds, port)->bridge_dev != br) + continue; + + rc = dsa_8021q_crosschip_bridge_join(ds, port, other_ds, + other_port, br, + &priv->crosschip_links); + if (rc) + return rc; + + rc = dsa_8021q_crosschip_bridge_join(other_ds, other_port, ds, + port, br, + &other_priv->crosschip_links); + if (rc) + return rc; + } + + return 0; +} + +static void sja1105_crosschip_bridge_leave(struct dsa_switch *ds, + int tree_index, int sw_index, + int other_port, + struct net_device *br) +{ + struct dsa_switch *other_ds = dsa_switch_find(tree_index, sw_index); + struct sja1105_private *other_priv = other_ds->priv; + struct sja1105_private *priv = ds->priv; + int port; + + if (other_ds->ops != &sja1105_switch_ops) + return; + + for (port = 0; port < ds->num_ports; port++) { + if (!dsa_is_user_port(ds, port)) + continue; + if (dsa_to_port(ds, port)->bridge_dev != br) + continue; + + dsa_8021q_crosschip_bridge_leave(ds, port, other_ds, other_port, + br, &priv->crosschip_links); + + dsa_8021q_crosschip_bridge_leave(other_ds, other_port, ds, + port, br, + &other_priv->crosschip_links); + } +} + +static int sja1105_replay_crosschip_vlans(struct dsa_switch *ds, bool enabled) +{ + struct sja1105_private *priv = ds->priv; + struct dsa_8021q_crosschip_link *c; + int rc; + + list_for_each_entry(c, &priv->crosschip_links, list) { + rc = dsa_8021q_crosschip_link_apply(ds, c->port, c->other_ds, + c->other_port, enabled); + if (rc) + break; + } + + return rc; +} + static int sja1105_setup_8021q_tagging(struct dsa_switch *ds, 
bool enabled) { int rc, i; @@ -1803,6 +1883,12 @@ static int sja1105_setup_8021q_tagging(struct dsa_switch *ds, bool enabled) return rc; } } + rc = sja1105_replay_crosschip_vlans(ds, enabled); + if (rc) { + dev_err(ds->dev, "Failed to replay crosschip VLANs: %d\n", rc); + return rc; + } + dev_info(ds->dev, "%s switch tagging\n", enabled ? "Enabled" : "Disabled"); return 0; @@ -2370,6 +2456,8 @@ static const struct dsa_switch_ops sja1105_switch_ops = { .cls_flower_add = sja1105_cls_flower_add, .cls_flower_del = sja1105_cls_flower_del, .cls_flower_stats = sja1105_cls_flower_stats, + .crosschip_bridge_join = sja1105_crosschip_bridge_join, + .crosschip_bridge_leave = sja1105_crosschip_bridge_leave, }; static int sja1105_check_device_id(struct sja1105_private *priv) @@ -2472,6 +2560,8 @@ static int sja1105_probe(struct spi_device *spi) mutex_init(&priv->ptp_data.lock); mutex_init(&priv->mgmt_lock); + INIT_LIST_HEAD(&priv->crosschip_links); + sja1105_tas_setup(ds); sja1105_flower_setup(ds); diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index c620d9139c28..b8daaec0896e 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -12,11 +12,33 @@ struct sk_buff; struct net_device; struct packet_type; +struct dsa_8021q_crosschip_link { + struct list_head list; + int port; + struct dsa_switch *other_ds; + int other_port; + refcount_t refcount; +}; + #if IS_ENABLED(CONFIG_NET_DSA_TAG_8021Q) int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index, bool enabled); +int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, bool enabled); + +int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, struct net_device *br, + struct list_head *crosschip_links); + +int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, struct net_device *br, + struct list_head *crosschip_links); + struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, u16 tpid, u16 tci); @@ -36,6 +58,29 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index, return 0; } +int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, bool enabled) +{ + return 0; +} + +int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, struct net_device *br, + struct list_head *crosschip_links) +{ + return 0; +} + +int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, struct net_device *br, + struct list_head *crosschip_links) +{ + return 0; +} + struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, u16 tpid, u16 tci) { diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index b97ad93d1c1a..ff9c5bf64bda 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -8,6 +8,7 @@ */ #include #include +#include #include "dsa_priv.h" @@ -288,6 +289,156 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled) } EXPORT_SYMBOL_GPL(dsa_port_setup_8021q_tagging); +int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, bool enabled) +{ + u16 rx_vid = dsa_8021q_rx_vid(ds, port); + + /* @rx_vid of local @ds port @port goes to @other_port of + * @other_ds + */ + return dsa_8021q_vid_apply(other_ds, other_port, rx_vid, 
+ BRIDGE_VLAN_INFO_UNTAGGED, enabled); +} +EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_link_apply); + +static int dsa_8021q_crosschip_link_add(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, + struct list_head *crosschip_links) +{ + struct dsa_8021q_crosschip_link *c; + + list_for_each_entry(c, crosschip_links, list) { + if (c->port == port && c->other_ds == other_ds && + c->other_port == other_port) { + refcount_inc(&c->refcount); + return 0; + } + } + + dev_dbg(ds->dev, "adding crosschip link from port %d to %s port %d\n", + port, dev_name(other_ds->dev), other_port); + + c = kzalloc(sizeof(*c), GFP_KERNEL); + if (!c) + return -ENOMEM; + + c->port = port; + c->other_ds = other_ds; + c->other_port = other_port; + refcount_set(&c->refcount, 1); + + list_add(&c->list, crosschip_links); + + return 0; +} + +static void dsa_8021q_crosschip_link_del(struct dsa_switch *ds, + struct dsa_8021q_crosschip_link *c, + struct list_head *crosschip_links, + bool *keep) +{ + *keep = !refcount_dec_and_test(&c->refcount); + + if (*keep) + return; + + dev_dbg(ds->dev, + "deleting crosschip link from port %d to %s port %d\n", + c->port, dev_name(c->other_ds->dev), c->other_port); + + list_del(&c->list); + kfree(c); +} + +/* Make traffic from local port @port be received by remote port @other_port. + * This means that our @rx_vid needs to be installed on @other_ds's upstream + * and user ports. The user ports should be egress-untagged so that they can + * pop the dsa_8021q VLAN. But the @other_upstream can be either egress-tagged + * or untagged: it doesn't matter, since it should never egress a frame having + * our @rx_vid. + */ +int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, struct net_device *br, + struct list_head *crosschip_links) +{ + /* @other_upstream is how @other_ds reaches us. If we are part + * of disjoint trees, then we are probably connected through + * our CPU ports. If we're part of the same tree though, we should + * probably use dsa_towards_port. 
+ */ + int other_upstream = dsa_upstream_port(other_ds, other_port); + int rc; + + rc = dsa_8021q_crosschip_link_add(ds, port, other_ds, + other_port, crosschip_links); + if (rc) + return rc; + + if (!br_vlan_enabled(br)) { + rc = dsa_8021q_crosschip_link_apply(ds, port, other_ds, + other_port, true); + if (rc) + return rc; + } + + rc = dsa_8021q_crosschip_link_add(ds, port, other_ds, + other_upstream, + crosschip_links); + if (rc) + return rc; + + if (!br_vlan_enabled(br)) { + rc = dsa_8021q_crosschip_link_apply(ds, port, other_ds, + other_upstream, true); + if (rc) + return rc; + } + + return 0; +} +EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_bridge_join); + +int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, struct net_device *br, + struct list_head *crosschip_links) +{ + int other_upstream = dsa_upstream_port(other_ds, other_port); + struct dsa_8021q_crosschip_link *c, *n; + + list_for_each_entry_safe(c, n, crosschip_links, list) { + if (c->port == port && c->other_ds == other_ds && + (c->other_port == other_port || + c->other_port == other_upstream)) { + struct dsa_switch *other_ds = c->other_ds; + int other_port = c->other_port; + bool keep; + int rc; + + dsa_8021q_crosschip_link_del(ds, c, crosschip_links, + &keep); + if (keep) + continue; + + if (!br_vlan_enabled(br)) { + rc = dsa_8021q_crosschip_link_apply(ds, port, + other_ds, + other_port, + false); + if (rc) + return rc; + } + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_bridge_leave); + struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, u16 tpid, u16 tci) { -- cgit v1.2.3 From eaa7b7228ff7f8688623120bb0cd75d1490d5d04 Mon Sep 17 00:00:00 2001 From: Tedd Ho-Jeong An Date: Fri, 1 May 2020 10:00:50 -0700 Subject: Bluetooth: Fix advertising handle is set to 0 This patch fixes the advertising handle being set to 0 regardless of the actual instance value. The affected commands are LE Set Advertising Set Random Address, LE Set Extended Advertising Data, and LE Set Extended Scan Response Data commands. 
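For illustration only, the corrected pattern shared by all three command paths is sketched below; struct example_cp stands in for the real HCI command parameter structures, each of which carries an advertising handle field:

/* Sketch, not patch code: per-instance commands must address the
 * advertising set they operate on instead of always targeting set 0.
 */
struct example_cp {
	__u8 handle;
	/* ...command-specific fields... */
};

static void example_fill_per_instance_cmd(struct example_cp *cp,
					  __u8 instance)
{
	memset(cp, 0, sizeof(*cp));
	cp->handle = instance;	/* was hard-coded to 0 before this fix */
}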
Signed-off-by: Tedd Ho-Jeong An Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_request.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 9ea40106ef17..3f470f0e432c 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -1447,7 +1447,7 @@ void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance) memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data)); hdev->scan_rsp_data_len = len; - cp.handle = 0; + cp.handle = instance; cp.length = len; cp.operation = LE_SET_ADV_DATA_OP_COMPLETE; cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG; @@ -1591,7 +1591,7 @@ void __hci_req_update_adv_data(struct hci_request *req, u8 instance) hdev->adv_data_len = len; cp.length = len; - cp.handle = 0; + cp.handle = instance; cp.operation = LE_SET_ADV_DATA_OP_COMPLETE; cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG; @@ -1876,7 +1876,7 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) memset(&cp, 0, sizeof(cp)); - cp.handle = 0; + cp.handle = instance; bacpy(&cp.bdaddr, &random_addr); hci_req_add(req, -- cgit v1.2.3 From 69d67b461a180144ad1d31174fadf3e3eda78e56 Mon Sep 17 00:00:00 2001 From: Konstantin Forostyan Date: Mon, 4 May 2020 16:01:49 +0000 Subject: Bluetooth: L2CAP: Fix errors during L2CAP_CREDIT_BASED_CONNECTION_REQ (0x17) Fix two typos in the L2CAP_CREDIT_BASED_CONNECTION_REQ (0x17) handling function that cause BlueZ to answer with an L2CAP_CR_LE_INVALID_PARAMS or L2CAP_CR_LE_INVALID_SCID error on a correct ECRED connection request. Enhanced Credit Based Mode support was recently introduced with the commit 15f02b91056253e8cdc592888f431da0731337b8 ("Bluetooth: L2CAP: Add initial code for Enhanced Credit Based Mode"). Signed-off-by: Konstantin Forostyan Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index fd9d0d08f9c9..fe913a5c754a 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -5927,7 +5927,7 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, if (!enable_ecred) return -EINVAL; - if (cmd_len < sizeof(*req) || cmd_len - sizeof(*req) % sizeof(u16)) { + if (cmd_len < sizeof(*req) || (cmd_len - sizeof(*req)) % sizeof(u16)) { result = L2CAP_CR_LE_INVALID_PARAMS; goto response; } @@ -5964,7 +5964,7 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, } result = L2CAP_CR_LE_SUCCESS; - cmd_len -= sizeof(req); + cmd_len -= sizeof(*req); num_scid = cmd_len / sizeof(u16); for (i = 0; i < num_scid; i++) { -- cgit v1.2.3 From 5f4b91728bba007be563fa5a3bd5d96b6a03b3b9 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Wed, 6 May 2020 09:57:46 +0200 Subject: Bluetooth: Add MGMT_EV_PHY_CONFIGURATION_CHANGED to supported list The event MGMT_EV_PHY_CONFIGURATION_CHANGED wasn't listed in the list of supported events. So add it. 
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/mgmt.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index f8c0a4fc8090..33b5640ea060 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -147,6 +147,7 @@ static const u16 mgmt_events[] = { MGMT_EV_ADVERTISING_ADDED, MGMT_EV_ADVERTISING_REMOVED, MGMT_EV_EXT_INFO_CHANGED, + MGMT_EV_PHY_CONFIGURATION_CHANGED, }; static const u16 mgmt_untrusted_commands[] = { -- cgit v1.2.3 From 181d695352305cc52a49c151a1c3370376e54887 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Wed, 6 May 2020 09:57:47 +0200 Subject: Bluetooth: Replace BT_DBG with bt_dev_dbg for management support The majority of management interaction are based on a controller index and have a hci_dev associated with it. So use bt_dev_dbg to have a clean way of indentifying the controller the debug message belongs to. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/mgmt.c | 221 +++++++++++++++++++++++++-------------------------- 1 file changed, 110 insertions(+), 111 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 33b5640ea060..78cf72b64014 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -294,7 +294,7 @@ static int read_version(struct sock *sk, struct hci_dev *hdev, void *data, { struct mgmt_rp_read_version rp; - BT_DBG("sock %p", sk); + bt_dev_dbg(hdev, "sock %p", sk); mgmt_fill_version_info(&rp); @@ -310,7 +310,7 @@ static int read_commands(struct sock *sk, struct hci_dev *hdev, void *data, size_t rp_size; int i, err; - BT_DBG("sock %p", sk); + bt_dev_dbg(hdev, "sock %p", sk); if (hci_sock_test_flag(sk, HCI_SOCK_TRUSTED)) { num_commands = ARRAY_SIZE(mgmt_commands); @@ -363,7 +363,7 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data, u16 count; int err; - BT_DBG("sock %p", sk); + bt_dev_dbg(hdev, "sock %p", sk); read_lock(&hci_dev_list_lock); @@ -397,7 +397,7 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data, if (d->dev_type == HCI_PRIMARY && !hci_dev_test_flag(d, HCI_UNCONFIGURED)) { rp->index[count++] = cpu_to_le16(d->id); - BT_DBG("Added hci%u", d->id); + bt_dev_dbg(hdev, "Added hci%u", d->id); } } @@ -423,7 +423,7 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev, u16 count; int err; - BT_DBG("sock %p", sk); + bt_dev_dbg(hdev, "sock %p", sk); read_lock(&hci_dev_list_lock); @@ -457,7 +457,7 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev, if (d->dev_type == HCI_PRIMARY && hci_dev_test_flag(d, HCI_UNCONFIGURED)) { rp->index[count++] = cpu_to_le16(d->id); - BT_DBG("Added hci%u", d->id); + bt_dev_dbg(hdev, "Added hci%u", d->id); } } @@ -482,7 +482,7 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev, u16 count; int err; - BT_DBG("sock %p", sk); + bt_dev_dbg(hdev, "sock %p", sk); read_lock(&hci_dev_list_lock); @@ -524,7 +524,7 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev, rp->entry[count].bus = d->bus; rp->entry[count++].index = cpu_to_le16(d->id); - BT_DBG("Added hci%u", d->id); + bt_dev_dbg(hdev, "Added hci%u", d->id); } rp->num_controllers = cpu_to_le16(count); @@ -600,7 +600,7 @@ static int read_config_info(struct sock *sk, struct hci_dev *hdev, struct mgmt_rp_read_config_info rp; u32 options = 0; - BT_DBG("sock %p %s", sk, hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -940,7 +940,7 @@ static void 
rpa_expired(struct work_struct *work) rpa_expired.work); struct hci_request req; - BT_DBG(""); + bt_dev_dbg(hdev, ""); hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); @@ -980,7 +980,7 @@ static int read_controller_info(struct sock *sk, struct hci_dev *hdev, { struct mgmt_rp_read_info rp; - BT_DBG("sock %p %s", sk, hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -1036,7 +1036,7 @@ static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev, struct mgmt_rp_read_ext_info *rp = (void *)buf; u16 eir_len; - BT_DBG("sock %p %s", sk, hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); memset(&buf, 0, sizeof(buf)); @@ -1095,7 +1095,7 @@ static int send_settings_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev) static void clean_up_hci_complete(struct hci_dev *hdev, u8 status, u16 opcode) { - BT_DBG("%s status 0x%02x", hdev->name, status); + bt_dev_dbg(hdev, "status 0x%02x", status); if (hci_conn_count(hdev) == 0) { cancel_delayed_work(&hdev->power_off); @@ -1171,7 +1171,7 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, struct mgmt_pending_cmd *cmd; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (cp->val != 0x00 && cp->val != 0x01) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED, @@ -1312,7 +1312,7 @@ void mgmt_set_discoverable_complete(struct hci_dev *hdev, u8 status) { struct mgmt_pending_cmd *cmd; - BT_DBG("status 0x%02x", status); + bt_dev_dbg(hdev, "status 0x%02x", status); hci_dev_lock(hdev); @@ -1351,7 +1351,7 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data, u16 timeout; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) && !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) @@ -1477,7 +1477,7 @@ void mgmt_set_connectable_complete(struct hci_dev *hdev, u8 status) { struct mgmt_pending_cmd *cmd; - BT_DBG("status 0x%02x", status); + bt_dev_dbg(hdev, "status 0x%02x", status); hci_dev_lock(hdev); @@ -1537,7 +1537,7 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data, struct mgmt_pending_cmd *cmd; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) && !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) @@ -1594,7 +1594,7 @@ static int set_bondable(struct sock *sk, struct hci_dev *hdev, void *data, bool changed; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (cp->val != 0x00 && cp->val != 0x01) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BONDABLE, @@ -1638,7 +1638,7 @@ static int set_link_security(struct sock *sk, struct hci_dev *hdev, void *data, u8 val, status; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); status = mgmt_bredr_support(hdev); if (status) @@ -1706,7 +1706,7 @@ static int set_ssp(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) u8 status; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); status = mgmt_bredr_support(hdev); if (status) @@ -1787,7 +1787,7 @@ static int set_hs(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) u8 status; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); status = mgmt_bredr_support(hdev); if (status) @@ -1893,7 +1893,7 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) int err; u8 val, enabled; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, 
"sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LE, @@ -2054,7 +2054,7 @@ unlock: static void add_uuid_complete(struct hci_dev *hdev, u8 status, u16 opcode) { - BT_DBG("status 0x%02x", status); + bt_dev_dbg(hdev, "status 0x%02x", status); mgmt_class_complete(hdev, MGMT_OP_ADD_UUID, status); } @@ -2067,7 +2067,7 @@ static int add_uuid(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) struct bt_uuid *uuid; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -2133,7 +2133,7 @@ static bool enable_service_cache(struct hci_dev *hdev) static void remove_uuid_complete(struct hci_dev *hdev, u8 status, u16 opcode) { - BT_DBG("status 0x%02x", status); + bt_dev_dbg(hdev, "status 0x%02x", status); mgmt_class_complete(hdev, MGMT_OP_REMOVE_UUID, status); } @@ -2148,7 +2148,7 @@ static int remove_uuid(struct sock *sk, struct hci_dev *hdev, void *data, struct hci_request req; int err, found; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -2219,7 +2219,7 @@ unlock: static void set_class_complete(struct hci_dev *hdev, u8 status, u16 opcode) { - BT_DBG("status 0x%02x", status); + bt_dev_dbg(hdev, "status 0x%02x", status); mgmt_class_complete(hdev, MGMT_OP_SET_DEV_CLASS, status); } @@ -2232,7 +2232,7 @@ static int set_dev_class(struct sock *sk, struct hci_dev *hdev, void *data, struct hci_request req; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_bredr_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEV_CLASS, @@ -2305,7 +2305,7 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, bool changed; int i; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_bredr_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, @@ -2331,8 +2331,8 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, MGMT_STATUS_INVALID_PARAMS); - BT_DBG("%s debug_keys %u key_count %u", hdev->name, cp->debug_keys, - key_count); + bt_dev_dbg(hdev, "debug_keys %u key_count %u", cp->debug_keys, + key_count); for (i = 0; i < key_count; i++) { struct mgmt_link_key_info *key = &cp->keys[i]; @@ -2533,7 +2533,7 @@ static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data, struct hci_conn *conn; int err; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); memset(&rp, 0, sizeof(rp)); bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr); @@ -2617,7 +2617,7 @@ static int get_connections(struct sock *sk, struct hci_dev *hdev, void *data, int err; u16 i; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -2693,7 +2693,7 @@ static int pin_code_reply(struct sock *sk, struct hci_dev *hdev, void *data, struct mgmt_pending_cmd *cmd; int err; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -2751,7 +2751,7 @@ static int set_io_capability(struct sock *sk, struct hci_dev *hdev, void *data, { struct mgmt_cp_set_io_capability *cp = data; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); if (cp->io_capability > SMP_IO_KEYBOARD_DISPLAY) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY, @@ -2761,8 +2761,7 @@ static int set_io_capability(struct sock *sk, struct hci_dev *hdev, void *data, hdev->io_capability = cp->io_capability; - BT_DBG("%s IO capability set to 0x%02x", hdev->name, - hdev->io_capability); + bt_dev_dbg(hdev, "IO 
capability set to 0x%02x", hdev->io_capability); hci_dev_unlock(hdev); @@ -2874,7 +2873,7 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, struct hci_conn *conn; int err; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); memset(&rp, 0, sizeof(rp)); bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr); @@ -3003,7 +3002,7 @@ static int cancel_pair_device(struct sock *sk, struct hci_dev *hdev, void *data, struct hci_conn *conn; int err; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -3114,7 +3113,7 @@ static int pin_code_neg_reply(struct sock *sk, struct hci_dev *hdev, { struct mgmt_cp_pin_code_neg_reply *cp = data; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); return user_pairing_resp(sk, hdev, &cp->addr, MGMT_OP_PIN_CODE_NEG_REPLY, @@ -3126,7 +3125,7 @@ static int user_confirm_reply(struct sock *sk, struct hci_dev *hdev, void *data, { struct mgmt_cp_user_confirm_reply *cp = data; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); if (len != sizeof(*cp)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_USER_CONFIRM_REPLY, @@ -3142,7 +3141,7 @@ static int user_confirm_neg_reply(struct sock *sk, struct hci_dev *hdev, { struct mgmt_cp_user_confirm_neg_reply *cp = data; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); return user_pairing_resp(sk, hdev, &cp->addr, MGMT_OP_USER_CONFIRM_NEG_REPLY, @@ -3154,7 +3153,7 @@ static int user_passkey_reply(struct sock *sk, struct hci_dev *hdev, void *data, { struct mgmt_cp_user_passkey_reply *cp = data; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); return user_pairing_resp(sk, hdev, &cp->addr, MGMT_OP_USER_PASSKEY_REPLY, @@ -3166,7 +3165,7 @@ static int user_passkey_neg_reply(struct sock *sk, struct hci_dev *hdev, { struct mgmt_cp_user_passkey_neg_reply *cp = data; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); return user_pairing_resp(sk, hdev, &cp->addr, MGMT_OP_USER_PASSKEY_NEG_REPLY, @@ -3207,7 +3206,7 @@ static void set_name_complete(struct hci_dev *hdev, u8 status, u16 opcode) struct mgmt_cp_set_local_name *cp; struct mgmt_pending_cmd *cmd; - BT_DBG("status 0x%02x", status); + bt_dev_dbg(hdev, "status 0x%02x", status); hci_dev_lock(hdev); @@ -3242,7 +3241,7 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data, struct hci_request req; int err; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -3311,7 +3310,7 @@ static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data, u16 appearance; int err; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_APPEARANCE, @@ -3343,7 +3342,7 @@ static int get_phy_configuration(struct sock *sk, struct hci_dev *hdev, { struct mgmt_rp_get_phy_confguration rp; - BT_DBG("sock %p %s", sk, hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -3376,7 +3375,7 @@ static void set_default_phy_complete(struct hci_dev *hdev, u8 status, { struct mgmt_pending_cmd *cmd; - BT_DBG("status 0x%02x", status); + bt_dev_dbg(hdev, "status 0x%02x", status); hci_dev_lock(hdev); @@ -3414,7 +3413,7 @@ static int set_phy_configuration(struct sock *sk, struct hci_dev *hdev, bool changed = false; int err; - BT_DBG("sock %p %s", sk, hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); configurable_phys = get_configurable_phys(hdev); supported_phys = get_supported_phys(hdev); @@ -3567,7 +3566,7 @@ static int set_blocked_keys(struct sock *sk, struct hci_dev *hdev, void *data, u16 key_count, expected_len; int i; - BT_DBG("request for %s", hdev->name); + 
bt_dev_dbg(hdev, "sock %p", sk); key_count = __le16_to_cpu(keys->key_count); if (key_count > max_key_count) { @@ -3613,7 +3612,7 @@ static int set_wideband_speech(struct sock *sk, struct hci_dev *hdev, int err; bool changed = false; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!test_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks)) return mgmt_cmd_status(sk, hdev->id, @@ -3718,7 +3717,7 @@ static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, size_t rp_size = sizeof(mgmt_rp); struct mgmt_pending_cmd *cmd; - BT_DBG("%s status %u", hdev->name, status); + bt_dev_dbg(hdev, "status %u", status); cmd = pending_find(MGMT_OP_READ_LOCAL_OOB_DATA, hdev); if (!cmd) @@ -3777,7 +3776,7 @@ static int read_local_oob_data(struct sock *sk, struct hci_dev *hdev, struct hci_request req; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -3827,7 +3826,7 @@ static int add_remote_oob_data(struct sock *sk, struct hci_dev *hdev, struct mgmt_addr_info *addr = data; int err; - BT_DBG("%s ", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!bdaddr_type_is_valid(addr->type)) return mgmt_cmd_complete(sk, hdev->id, @@ -3936,7 +3935,7 @@ static int remove_remote_oob_data(struct sock *sk, struct hci_dev *hdev, u8 status; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (cp->addr.type != BDADDR_BREDR) return mgmt_cmd_complete(sk, hdev->id, @@ -3970,7 +3969,7 @@ void mgmt_start_discovery_complete(struct hci_dev *hdev, u8 status) { struct mgmt_pending_cmd *cmd; - BT_DBG("status %d", status); + bt_dev_dbg(hdev, "status %d", status); hci_dev_lock(hdev); @@ -4031,7 +4030,7 @@ static int start_discovery_internal(struct sock *sk, struct hci_dev *hdev, u8 status; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -4123,7 +4122,7 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev, u8 status; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -4218,7 +4217,7 @@ void mgmt_stop_discovery_complete(struct hci_dev *hdev, u8 status) { struct mgmt_pending_cmd *cmd; - BT_DBG("status %d", status); + bt_dev_dbg(hdev, "status %d", status); hci_dev_lock(hdev); @@ -4244,7 +4243,7 @@ static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data, struct mgmt_pending_cmd *cmd; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -4286,7 +4285,7 @@ static int confirm_name(struct sock *sk, struct hci_dev *hdev, void *data, struct inquiry_entry *e; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -4328,7 +4327,7 @@ static int block_device(struct sock *sk, struct hci_dev *hdev, void *data, u8 status; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!bdaddr_type_is_valid(cp->addr.type)) return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_BLOCK_DEVICE, @@ -4364,7 +4363,7 @@ static int unblock_device(struct sock *sk, struct hci_dev *hdev, void *data, u8 status; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!bdaddr_type_is_valid(cp->addr.type)) return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNBLOCK_DEVICE, @@ -4401,7 +4400,7 @@ static int set_device_id(struct sock *sk, struct hci_dev *hdev, void *data, int err; __u16 source; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); source = __le16_to_cpu(cp->source); @@ -4431,7 +4430,7 @@ static int 
set_device_id(struct sock *sk, struct hci_dev *hdev, void *data, static void enable_advertising_instance(struct hci_dev *hdev, u8 status, u16 opcode) { - BT_DBG("status %d", status); + bt_dev_dbg(hdev, "status %d", status); } static void set_advertising_complete(struct hci_dev *hdev, u8 status, @@ -4517,7 +4516,7 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data, u8 val, status; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); status = mgmt_le_support(hdev); if (status) @@ -4626,7 +4625,7 @@ static int set_static_address(struct sock *sk, struct hci_dev *hdev, struct mgmt_cp_set_static_address *cp = data; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_STATIC_ADDRESS, @@ -4671,7 +4670,7 @@ static int set_scan_params(struct sock *sk, struct hci_dev *hdev, __u16 interval, window; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS, @@ -4726,7 +4725,7 @@ static void fast_connectable_complete(struct hci_dev *hdev, u8 status, { struct mgmt_pending_cmd *cmd; - BT_DBG("status 0x%02x", status); + bt_dev_dbg(hdev, "status 0x%02x", status); hci_dev_lock(hdev); @@ -4763,7 +4762,7 @@ static int set_fast_connectable(struct sock *sk, struct hci_dev *hdev, struct hci_request req; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) || hdev->hci_ver < BLUETOOTH_VER_1_2) @@ -4824,7 +4823,7 @@ static void set_bredr_complete(struct hci_dev *hdev, u8 status, u16 opcode) { struct mgmt_pending_cmd *cmd; - BT_DBG("status 0x%02x", status); + bt_dev_dbg(hdev, "status 0x%02x", status); hci_dev_lock(hdev); @@ -4859,7 +4858,7 @@ static int set_bredr(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) struct hci_request req; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_bredr_capable(hdev) || !lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR, @@ -4969,7 +4968,7 @@ static void sc_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode) struct mgmt_pending_cmd *cmd; struct mgmt_mode *cp; - BT_DBG("%s status %u", hdev->name, status); + bt_dev_dbg(hdev, "status %u", status); hci_dev_lock(hdev); @@ -5018,7 +5017,7 @@ static int set_secure_conn(struct sock *sk, struct hci_dev *hdev, u8 val; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_sc_capable(hdev) && !hci_dev_test_flag(hdev, HCI_LE_ENABLED)) @@ -5104,7 +5103,7 @@ static int set_debug_keys(struct sock *sk, struct hci_dev *hdev, bool changed, use_changed; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (cp->val != 0x00 && cp->val != 0x01 && cp->val != 0x02) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEBUG_KEYS, @@ -5151,7 +5150,7 @@ static int set_privacy(struct sock *sk, struct hci_dev *hdev, void *cp_data, bool changed; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY, @@ -5226,7 +5225,7 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data, u16 irk_count, expected_len; int i, err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, 
MGMT_OP_LOAD_IRKS, @@ -5248,7 +5247,7 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data, MGMT_STATUS_INVALID_PARAMS); } - BT_DBG("%s irk_count %u", hdev->name, irk_count); + bt_dev_dbg(hdev, "irk_count %u", irk_count); for (i = 0; i < irk_count; i++) { struct mgmt_irk_info *key = &cp->irks[i]; @@ -5316,7 +5315,7 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, u16 key_count, expected_len; int i, err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS, @@ -5338,7 +5337,7 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, MGMT_STATUS_INVALID_PARAMS); } - BT_DBG("%s key_count %u", hdev->name, key_count); + bt_dev_dbg(hdev, "key_count %u", key_count); for (i = 0; i < key_count; i++) { struct mgmt_ltk_info *key = &cp->keys[i]; @@ -5439,7 +5438,7 @@ static void conn_info_refresh_complete(struct hci_dev *hdev, u8 hci_status, u16 handle; u8 status; - BT_DBG("status 0x%02x", hci_status); + bt_dev_dbg(hdev, "status 0x%02x", hci_status); hci_dev_lock(hdev); @@ -5493,7 +5492,7 @@ static int get_conn_info(struct sock *sk, struct hci_dev *hdev, void *data, unsigned long conn_info_age; int err = 0; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); memset(&rp, 0, sizeof(rp)); bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr); @@ -5647,7 +5646,7 @@ static void get_clock_info_complete(struct hci_dev *hdev, u8 status, u16 opcode) struct mgmt_pending_cmd *cmd; struct hci_conn *conn; - BT_DBG("%s status %u", hdev->name, status); + bt_dev_dbg(hdev, "status %u", status); hci_dev_lock(hdev); @@ -5684,7 +5683,7 @@ static int get_clock_info(struct sock *sk, struct hci_dev *hdev, void *data, struct hci_conn *conn; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); memset(&rp, 0, sizeof(rp)); bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr); @@ -5805,8 +5804,8 @@ static int hci_conn_params_set(struct hci_dev *hdev, bdaddr_t *addr, params->auto_connect = auto_connect; - BT_DBG("addr %pMR (type %u) auto_connect %u", addr, addr_type, - auto_connect); + bt_dev_dbg(hdev, "addr %pMR (type %u) auto_connect %u", + addr, addr_type, auto_connect); return 0; } @@ -5830,7 +5829,7 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, u8 auto_conn, addr_type; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!bdaddr_type_is_valid(cp->addr.type) || !bacmp(&cp->addr.bdaddr, BDADDR_ANY)) @@ -5928,7 +5927,7 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, struct mgmt_cp_remove_device *cp = data; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -6037,7 +6036,7 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, kfree(p); } - BT_DBG("All LE connection parameters were removed"); + bt_dev_dbg(hdev, "All LE connection parameters were removed"); hci_update_background_scan(hdev); } @@ -6080,7 +6079,7 @@ static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data, MGMT_STATUS_INVALID_PARAMS); } - BT_DBG("%s param_count %u", hdev->name, param_count); + bt_dev_dbg(hdev, "param_count %u", param_count); hci_dev_lock(hdev); @@ -6092,8 +6091,8 @@ static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data, u16 min, max, latency, timeout; u8 addr_type; - BT_DBG("Adding %pMR (type %u)", &param->addr.bdaddr, - param->addr.type); + bt_dev_dbg(hdev, "Adding %pMR (type %u)", &param->addr.bdaddr, +
param->addr.type); if (param->addr.type == BDADDR_LE_PUBLIC) { addr_type = ADDR_LE_DEV_PUBLIC; @@ -6109,8 +6108,8 @@ static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data, latency = le16_to_cpu(param->latency); timeout = le16_to_cpu(param->timeout); - BT_DBG("min 0x%04x max 0x%04x latency 0x%04x timeout 0x%04x", - min, max, latency, timeout); + bt_dev_dbg(hdev, "min 0x%04x max 0x%04x latency 0x%04x timeout 0x%04x", + min, max, latency, timeout); if (hci_check_conn_params(min, max, latency, timeout) < 0) { bt_dev_err(hdev, "ignoring invalid connection parameters"); @@ -6143,7 +6142,7 @@ static int set_external_config(struct sock *sk, struct hci_dev *hdev, bool changed; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (hdev_is_powered(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG, @@ -6199,7 +6198,7 @@ static int set_public_address(struct sock *sk, struct hci_dev *hdev, bool changed; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (hdev_is_powered(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PUBLIC_ADDRESS, @@ -6254,7 +6253,7 @@ static void read_local_oob_ext_data_complete(struct hci_dev *hdev, u8 status, u16 eir_len; int err; - BT_DBG("%s status %u", hdev->name, status); + bt_dev_dbg(hdev, "status %u", status); cmd = pending_find(MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev); if (!cmd) @@ -6393,7 +6392,7 @@ static int read_local_oob_ext_data(struct sock *sk, struct hci_dev *hdev, u8 status, flags, role, addr[7], hash[16], rand[16]; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (hdev_is_powered(hdev)) { switch (cp->type) { @@ -6580,7 +6579,7 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev, u32 supported_flags; u8 *instance; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_ADV_FEATURES, @@ -6723,7 +6722,7 @@ static void add_advertising_complete(struct hci_dev *hdev, u8 status, struct adv_info *adv_instance, *n; u8 instance; - BT_DBG("status %d", status); + bt_dev_dbg(hdev, "status %d", status); hci_dev_lock(hdev); @@ -6782,7 +6781,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, struct mgmt_pending_cmd *cmd; struct hci_request req; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); status = mgmt_le_support(hdev); if (status) @@ -6919,7 +6918,7 @@ static void remove_advertising_complete(struct hci_dev *hdev, u8 status, struct mgmt_cp_remove_advertising *cp; struct mgmt_rp_remove_advertising rp; - BT_DBG("status %d", status); + bt_dev_dbg(hdev, "status %d", status); hci_dev_lock(hdev); @@ -6951,7 +6950,7 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev, struct hci_request req; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -7023,7 +7022,7 @@ static int get_adv_size_info(struct sock *sk, struct hci_dev *hdev, u32 flags, supported_flags; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO, @@ -7251,7 +7250,7 @@ void mgmt_power_on(struct hci_dev *hdev, int err) { struct cmd_lookup match = { NULL, hdev }; - BT_DBG("err %d", err); + bt_dev_dbg(hdev, "err %d", err); hci_dev_lock(hdev); @@ -7670,7 +7669,7 @@ int mgmt_user_confirm_request(struct hci_dev *hdev, bdaddr_t *bdaddr, { struct mgmt_ev_user_confirm_request ev; - BT_DBG("%s", hdev->name); + 
bt_dev_dbg(hdev, "bdaddr %pMR", bdaddr); bacpy(&ev.addr.bdaddr, bdaddr); ev.addr.type = link_to_bdaddr(link_type, addr_type); @@ -7686,7 +7685,7 @@ int mgmt_user_passkey_request(struct hci_dev *hdev, bdaddr_t *bdaddr, { struct mgmt_ev_user_passkey_request ev; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "bdaddr %pMR", bdaddr); bacpy(&ev.addr.bdaddr, bdaddr); ev.addr.type = link_to_bdaddr(link_type, addr_type); @@ -7747,7 +7746,7 @@ int mgmt_user_passkey_notify(struct hci_dev *hdev, bdaddr_t *bdaddr, { struct mgmt_ev_passkey_notify ev; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "bdaddr %pMR", bdaddr); bacpy(&ev.addr.bdaddr, bdaddr); ev.addr.type = link_to_bdaddr(link_type, addr_type); @@ -8166,7 +8165,7 @@ void mgmt_discovering(struct hci_dev *hdev, u8 discovering) { struct mgmt_ev_discovering ev; - BT_DBG("%s discovering %u", hdev->name, discovering); + bt_dev_dbg(hdev, "discovering %u", discovering); memset(&ev, 0, sizeof(ev)); ev.type = hdev->discovery.type; -- cgit v1.2.3 From d5cc6626b33780699c7a4986f3521361306862fe Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Wed, 6 May 2020 09:57:49 +0200 Subject: Bluetooth: Introduce HCI_MGMT_HDEV_OPTIONAL option When setting HCI_MGMT_HDEV_OPTIONAL it is possible to target a specific conntroller or a global interface. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_core.h | 1 + net/bluetooth/hci_sock.c | 12 +++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 239ab72f16c6..0c7f3ad76665 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1554,6 +1554,7 @@ void hci_sock_dev_event(struct hci_dev *hdev, int event); #define HCI_MGMT_NO_HDEV BIT(1) #define HCI_MGMT_UNTRUSTED BIT(2) #define HCI_MGMT_UNCONFIGURED BIT(3) +#define HCI_MGMT_HDEV_OPTIONAL BIT(4) struct hci_mgmt_handler { int (*func) (struct sock *sk, struct hci_dev *hdev, void *data, diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 9c4a093f8960..caf38a8ea6a8 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1579,11 +1579,13 @@ static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk, } } - no_hdev = (handler->flags & HCI_MGMT_NO_HDEV); - if (no_hdev != !hdev) { - err = mgmt_cmd_status(sk, index, opcode, - MGMT_STATUS_INVALID_INDEX); - goto done; + if (!(handler->flags & HCI_MGMT_HDEV_OPTIONAL)) { + no_hdev = (handler->flags & HCI_MGMT_NO_HDEV); + if (no_hdev != !hdev) { + err = mgmt_cmd_status(sk, index, opcode, + MGMT_STATUS_INVALID_INDEX); + goto done; + } } var_len = (handler->flags & HCI_MGMT_VAR_LEN); -- cgit v1.2.3 From 568602457c1ab6d26db828de168e4ef35b88f1bc Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Wed, 6 May 2020 09:57:50 +0200 Subject: Bluetooth: Replace BT_DBG with bt_dev_dbg for security manager support The security manager operates on a specific controller and thus use bt_dev_dbg to indetify the controller for each debug message. 
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/smp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index df22cbf94693..5510017cf9ff 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -508,7 +508,7 @@ bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16], if (!chan || !chan->data) return false; - BT_DBG("RPA %pMR IRK %*phN", bdaddr, 16, irk); + bt_dev_dbg(hdev, "RPA %pMR IRK %*phN", bdaddr, 16, irk); err = smp_ah(irk, &bdaddr->b[3], hash); if (err) @@ -534,7 +534,7 @@ int smp_generate_rpa(struct hci_dev *hdev, const u8 irk[16], bdaddr_t *rpa) if (err < 0) return err; - BT_DBG("RPA %pMR", rpa); + bt_dev_dbg(hdev, "RPA %pMR", rpa); return 0; } @@ -551,7 +551,7 @@ int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16]) smp = chan->data; if (hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) { - BT_DBG("Using debug keys"); + bt_dev_dbg(hdev, "Using debug keys"); err = set_ecdh_privkey(smp->tfm_ecdh, debug_sk); if (err) return err; @@ -1867,7 +1867,7 @@ static u8 sc_send_public_key(struct smp_chan *smp) { struct hci_dev *hdev = smp->conn->hcon->hdev; - BT_DBG(""); + bt_dev_dbg(hdev, ""); if (test_bit(SMP_FLAG_LOCAL_OOB, &smp->flags)) { struct l2cap_chan *chan = hdev->smp_data; -- cgit v1.2.3 From a10c907ce0e5e138c3da091fcb7c3d109a15aec5 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Wed, 6 May 2020 09:57:51 +0200 Subject: Bluetooth: Add support for experimental features configuration To enable platform specific experimental features, introduce this new set of management commands and events. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci.h | 1 + include/net/bluetooth/mgmt.h | 27 +++++++++++++++++++++ net/bluetooth/mgmt.c | 58 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index e5bc1dfe809a..16ab6ce87883 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -258,6 +258,7 @@ enum { HCI_MGMT_DEV_CLASS_EVENTS, HCI_MGMT_LOCAL_NAME_EVENTS, HCI_MGMT_OOB_DATA_EVENTS, + HCI_MGMT_EXP_FEATURE_EVENTS, }; /* diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 9d4d87c6028e..16e0d87bd8fa 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -681,6 +681,27 @@ struct mgmt_rp_read_security_info { __u8 sec[]; } __packed; +#define MGMT_OP_READ_EXP_FEATURES_INFO 0x0049 +#define MGMT_READ_EXP_FEATURES_INFO_SIZE 0 +struct mgmt_rp_read_exp_features_info { + __le16 feature_count; + struct { + __u8 uuid[16]; + __le32 flags; + } features[]; +} __packed; + +#define MGMT_OP_SET_EXP_FEATURE 0x004a +struct mgmt_cp_set_exp_feature { + __u8 uuid[16]; + __u8 param[]; +} __packed; +#define MGMT_SET_EXP_FEATURE_SIZE 16 +struct mgmt_rp_set_exp_feature { + __u8 uuid[16]; + __le32 flags; +} __packed; + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; @@ -906,3 +927,9 @@ struct mgmt_ev_ext_info_changed { struct mgmt_ev_phy_configuration_changed { __le32 selected_phys; } __packed; + +#define MGMT_EV_EXP_FEATURE_CHANGED 0x0027 +struct mgmt_ev_exp_feature_changed { + __u8 uuid[16]; + __le32 flags; +} __packed; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 78cf72b64014..3c6be70d98ef 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -109,6 +109,8 @@ static const u16 mgmt_commands[] = { 
MGMT_OP_SET_BLOCKED_KEYS, MGMT_OP_SET_WIDEBAND_SPEECH, MGMT_OP_READ_SECURITY_INFO, + MGMT_OP_READ_EXP_FEATURES_INFO, + MGMT_OP_SET_EXP_FEATURE, }; static const u16 mgmt_events[] = { @@ -148,6 +150,7 @@ static const u16 mgmt_events[] = { MGMT_EV_ADVERTISING_REMOVED, MGMT_EV_EXT_INFO_CHANGED, MGMT_EV_PHY_CONFIGURATION_CHANGED, + MGMT_EV_EXP_FEATURE_CHANGED, }; static const u16 mgmt_untrusted_commands[] = { @@ -158,6 +161,7 @@ static const u16 mgmt_untrusted_commands[] = { MGMT_OP_READ_EXT_INDEX_LIST, MGMT_OP_READ_EXT_INFO, MGMT_OP_READ_SECURITY_INFO, + MGMT_OP_READ_EXP_FEATURES_INFO, }; static const u16 mgmt_untrusted_events[] = { @@ -172,6 +176,7 @@ static const u16 mgmt_untrusted_events[] = { MGMT_EV_EXT_INDEX_ADDED, MGMT_EV_EXT_INDEX_REMOVED, MGMT_EV_EXT_INFO_CHANGED, + MGMT_EV_EXP_FEATURE_CHANGED, }; #define CACHE_TIMEOUT msecs_to_jiffies(2 * 1000) @@ -3710,6 +3715,53 @@ static int read_security_info(struct sock *sk, struct hci_dev *hdev, rp, sizeof(*rp) + sec_len); } +static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, + void *data, u16 data_len) +{ + char buf[42]; + struct mgmt_rp_read_exp_features_info *rp = (void *)buf; + u16 idx = 0; + + bt_dev_dbg(hdev, "sock %p", sk); + + memset(&buf, 0, sizeof(buf)); + + rp->feature_count = cpu_to_le16(idx); + + /* After reading the experimental features information, enable + * the events to update client on any future change. + */ + hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); + + return mgmt_cmd_complete(sk, hdev ? hdev->id : MGMT_INDEX_NONE, + MGMT_OP_READ_EXP_FEATURES_INFO, + 0, rp, sizeof(*rp) + (20 * idx)); +} + +static int set_exp_feature(struct sock *sk, struct hci_dev *hdev, + void *data, u16 data_len) +{ + struct mgmt_cp_set_exp_feature *cp = data; + struct mgmt_rp_set_exp_feature rp; + + bt_dev_dbg(hdev, "sock %p", sk); + + if (!memcmp(cp->uuid, ZERO_KEY, 16)) { + memset(rp.uuid, 0, 16); + rp.flags = cpu_to_le32(0); + + hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); + + return mgmt_cmd_complete(sk, hdev ? hdev->id : MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, 0, + &rp, sizeof(rp)); + } + + return mgmt_cmd_status(sk, hdev ? hdev->id : MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_NOT_SUPPORTED); +} + static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) { @@ -7152,6 +7204,12 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { set_wideband_speech, MGMT_SETTING_SIZE }, { read_security_info, MGMT_READ_SECURITY_INFO_SIZE, HCI_MGMT_UNTRUSTED }, + { read_exp_features_info, MGMT_READ_EXP_FEATURES_INFO_SIZE, + HCI_MGMT_UNTRUSTED | + HCI_MGMT_HDEV_OPTIONAL }, + { set_exp_feature, MGMT_SET_EXP_FEATURE_SIZE, + HCI_MGMT_VAR_LEN | + HCI_MGMT_HDEV_OPTIONAL }, }; void mgmt_index_added(struct hci_dev *hdev) -- cgit v1.2.3 From e625e50ceee18bc1e3fb1a6375e089405a797a4d Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Wed, 6 May 2020 09:57:52 +0200 Subject: Bluetooth: Introduce debug feature when dynamic debug is disabled In case dynamic debug is disabled, this feature allows a vendor platform to provide debug statement printing. 
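For illustration, a client enables such a feature by sending MGMT_OP_SET_EXP_FEATURE with the 16-byte feature UUID followed by a single parameter octet, matching struct mgmt_cp_set_exp_feature from the previous patch. The sketch below is hypothetical userspace code; the UUID bytes are the bytewise little-endian form of the debug UUID d4992530-b9ec-469f-ab01-6c481c47da1c introduced further down in this patch:

	#include <stdint.h>
	#include <string.h>

	static const uint8_t debug_uuid[16] = {
		0x1c, 0xda, 0x47, 0x1c, 0x48, 0x6c, 0x01, 0xab,
		0x9f, 0x46, 0xec, 0xb9, 0x30, 0x25, 0x99, 0xd4,
	};

	/* Fill buf with the MGMT_OP_SET_EXP_FEATURE payload and return
	 * its length: MGMT_SET_EXP_FEATURE_SIZE (16) plus the single
	 * parameter octet the handler accepts (0x00 or 0x01).
	 */
	static size_t build_set_exp_feature(uint8_t buf[17], int enable)
	{
		memcpy(buf, debug_uuid, 16);	/* cp->uuid */
		buf[16] = enable ? 0x01 : 0x00;	/* cp->param[0] */
		return 17;
	}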
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/bluetooth.h | 11 +++++ net/bluetooth/Kconfig | 7 ++++ net/bluetooth/lib.c | 33 +++++++++++++++ net/bluetooth/mgmt.c | 87 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 138 insertions(+) (limited to 'net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 3fa7b1e3c5d9..18190055374c 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -153,6 +153,12 @@ __printf(1, 2) void bt_warn(const char *fmt, ...); __printf(1, 2) void bt_err(const char *fmt, ...); +#if IS_ENABLED(CONFIG_BT_FEATURE_DEBUG) +void bt_dbg_set(bool enable); +bool bt_dbg_get(void); +__printf(1, 2) +void bt_dbg(const char *fmt, ...); +#endif __printf(1, 2) void bt_warn_ratelimited(const char *fmt, ...); __printf(1, 2) @@ -161,7 +167,12 @@ void bt_err_ratelimited(const char *fmt, ...); #define BT_INFO(fmt, ...) bt_info(fmt "\n", ##__VA_ARGS__) #define BT_WARN(fmt, ...) bt_warn(fmt "\n", ##__VA_ARGS__) #define BT_ERR(fmt, ...) bt_err(fmt "\n", ##__VA_ARGS__) + +#if IS_ENABLED(CONFIG_BT_FEATURE_DEBUG) +#define BT_DBG(fmt, ...) bt_dbg(fmt "\n", ##__VA_ARGS__) +#else #define BT_DBG(fmt, ...) pr_debug(fmt "\n", ##__VA_ARGS__) +#endif #define bt_dev_info(hdev, fmt, ...) \ BT_INFO("%s: " fmt, (hdev)->name, ##__VA_ARGS__) diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index 9e25c6570170..1d6d243cdde9 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -135,4 +135,11 @@ config BT_SELFTEST_SMP Run test cases for SMP cryptographic functionality, including both legacy SMP as well as the Secure Connections features. +config BT_FEATURE_DEBUG + bool "Enable runtime option for debugging statements" + depends on BT && !DYNAMIC_DEBUG + help + This provides an option to enable/disable debugging statements + at runtime via the experimental features interface. + source "drivers/bluetooth/Kconfig" diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c index c09e0a3a0ed9..5326f41a58b7 100644 --- a/net/bluetooth/lib.c +++ b/net/bluetooth/lib.c @@ -183,6 +183,39 @@ void bt_err(const char *format, ...) } EXPORT_SYMBOL(bt_err); +#ifdef CONFIG_BT_FEATURE_DEBUG +static bool debug_enable; + +void bt_dbg_set(bool enable) +{ + debug_enable = enable; +} + +bool bt_dbg_get(void) +{ + return debug_enable; +} + +void bt_dbg(const char *format, ...) +{ + struct va_format vaf; + va_list args; + + if (likely(!debug_enable)) + return; + + va_start(args, format); + + vaf.fmt = format; + vaf.va = &args; + + printk(KERN_DEBUG pr_fmt("%pV"), &vaf); + + va_end(args); +} +EXPORT_SYMBOL(bt_dbg); +#endif + void bt_warn_ratelimited(const char *format, ...) { struct va_format vaf; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 3c6be70d98ef..9e8a3cccc6ca 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3715,6 +3715,14 @@ static int read_security_info(struct sock *sk, struct hci_dev *hdev, rp, sizeof(*rp) + sec_len); } +#ifdef CONFIG_BT_FEATURE_DEBUG +/* d4992530-b9ec-469f-ab01-6c481c47da1c */ +static const u8 debug_uuid[16] = { + 0x1c, 0xda, 0x47, 0x1c, 0x48, 0x6c, 0x01, 0xab, + 0x9f, 0x46, 0xec, 0xb9, 0x30, 0x25, 0x99, 0xd4, +}; +#endif + static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { @@ -3726,6 +3734,16 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, memset(&buf, 0, sizeof(buf)); +#ifdef CONFIG_BT_FEATURE_DEBUG + if (!hdev) { + u32 flags = bt_dbg_get() ? 
BIT(0) : 0; + + memcpy(rp->features[idx].uuid, debug_uuid, 16); + rp->features[idx].flags = cpu_to_le32(flags); + idx++; + } +#endif + rp->feature_count = cpu_to_le16(idx); /* After reading the experimental features information, enable @@ -3738,6 +3756,21 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, 0, rp, sizeof(*rp) + (20 * idx)); } +#ifdef CONFIG_BT_FEATURE_DEBUG +static int exp_debug_feature_changed(bool enabled, struct sock *skip) +{ + struct mgmt_ev_exp_feature_changed ev; + + memset(&ev, 0, sizeof(ev)); + memcpy(ev.uuid, debug_uuid, 16); + ev.flags = cpu_to_le32(enabled ? BIT(0) : 0); + + return mgmt_limited_event(MGMT_EV_EXP_FEATURE_CHANGED, NULL, + &ev, sizeof(ev), + HCI_MGMT_EXP_FEATURE_EVENTS, skip); +} +#endif + static int set_exp_feature(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { @@ -3750,6 +3783,17 @@ static int set_exp_feature(struct sock *sk, struct hci_dev *hdev, memset(rp.uuid, 0, 16); rp.flags = cpu_to_le32(0); +#ifdef CONFIG_BT_FEATURE_DEBUG + if (!hdev) { + bool changed = bt_dbg_get(); + + bt_dbg_set(false); + + if (changed) + exp_debug_feature_changed(false, sk); + } +#endif + hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); return mgmt_cmd_complete(sk, hdev ? hdev->id : MGMT_INDEX_NONE, @@ -3757,6 +3801,49 @@ static int set_exp_feature(struct sock *sk, struct hci_dev *hdev, &rp, sizeof(rp)); } +#ifdef CONFIG_BT_FEATURE_DEBUG + if (!memcmp(cp->uuid, debug_uuid, 16)) { + bool val, changed; + int err; + + /* Command requires to use the non-controller index */ + if (hdev) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_INDEX); + + /* Parameters are limited to a single octet */ + if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1) + return mgmt_cmd_status(sk, MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); + + /* Only boolean on/off is supported */ + if (cp->param[0] != 0x00 && cp->param[0] != 0x01) + return mgmt_cmd_status(sk, MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); + + val = !!cp->param[0]; + changed = val ? !bt_dbg_get() : bt_dbg_get(); + bt_dbg_set(val); + + memcpy(rp.uuid, debug_uuid, 16); + rp.flags = cpu_to_le32(val ? BIT(0) : 0); + + hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); + + err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, 0, + &rp, sizeof(rp)); + + if (changed) + exp_debug_feature_changed(val, sk); + + return err; + } +#endif + return mgmt_cmd_status(sk, hdev ? hdev->id : MGMT_INDEX_NONE, MGMT_OP_SET_EXP_FEATURE, MGMT_STATUS_NOT_SUPPORTED); -- cgit v1.2.3 From 7d4343d501f9b5ddbc92f278adba339d16d010e1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 May 2020 10:33:42 +0200 Subject: xfrm: fix unused variable warning if CONFIG_NETFILTER=n After recent change 'x' is only used when CONFIG_NETFILTER is set: net/ipv4/xfrm4_output.c: In function '__xfrm4_output': net/ipv4/xfrm4_output.c:19:21: warning: unused variable 'x' [-Wunused-variable] 19 | struct xfrm_state *x = skb_dst(skb)->xfrm; Expand the CONFIG_NETFILTER scope to avoid this. 
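The fix pattern, shown as a minimal sketch (the function body is abridged; only the #ifdef placement matters): when a local variable's only reader is compiled conditionally, declare it under the same #ifdef so both configurations build without warnings.

	static int example_output(struct net *net, struct sock *sk,
				  struct sk_buff *skb)
	{
	#ifdef CONFIG_NETFILTER
		/* Declared inside the #ifdef: with CONFIG_NETFILTER=n
		 * the variable no longer exists, so -Wunused-variable
		 * stays quiet.
		 */
		struct xfrm_state *x = skb_dst(skb)->xfrm;

		if (!x) {
			IPCB(skb)->flags |= IPSKB_REROUTED;
			return dst_output(net, sk, skb);
		}
	#endif
		return 0;	/* rest of the output path elided */
	}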
Fixes: 2ab6096db2f1 ("xfrm: remove output_finish indirection from xfrm_state_afinfo") Reported-by: Stephen Rothwell Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 502eb189d852..3cff51ba72bb 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -16,9 +16,9 @@ static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) { +#ifdef CONFIG_NETFILTER struct xfrm_state *x = skb_dst(skb)->xfrm; -#ifdef CONFIG_NETFILTER if (!x) { IPCB(skb)->flags |= IPSKB_REROUTED; return dst_output(net, sk, skb); -- cgit v1.2.3 From 2fa3888bb7a9fb3966c77555c8cd6f4bd6a1439a Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 11 May 2020 16:47:14 -0700 Subject: net: dsa: ocelot: Constify dsa_device_ops ocelot_netdev_ops should be const since that is what the DSA layer expects. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/tag_ocelot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index 59de1315100f..b0c98ee4e13b 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -228,7 +228,7 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, return skb; } -static struct dsa_device_ops ocelot_netdev_ops = { +static const struct dsa_device_ops ocelot_netdev_ops = { .name = "ocelot", .proto = DSA_TAG_PROTO_OCELOT, .xmit = ocelot_xmit, -- cgit v1.2.3 From 097f024454fca0e13bbba0ab54dfe63ac5610953 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 11 May 2020 16:47:15 -0700 Subject: net: dsa: tag_sja1105: Constify dsa_device_ops sja1105_netdev_ops should be const since that is what the DSA layer expects. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/tag_sja1105.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index d553bf36bd41..5ecac5921a7d 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -304,7 +304,7 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, is_meta); } -static struct dsa_device_ops sja1105_netdev_ops = { +static const struct dsa_device_ops sja1105_netdev_ops = { .name = "sja1105", .proto = DSA_TAG_PROTO_SJA1105, .xmit = sja1105_xmit, -- cgit v1.2.3 From 0462b6bdb6445b887b8896f28be92e0d94c92e7b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 11 May 2020 13:59:11 +0200 Subject: net: add a CMSG_USER_DATA macro Add a variant of CMSG_DATA that operates on user pointer to avoid sparse warnings about casting to/from user pointers. Also fix up CMSG_DATA to rely on the gcc extension that allows void pointer arithmetics to cut down on the amount of casts. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- include/linux/socket.h | 5 ++++- net/core/scm.c | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/linux/socket.h b/include/linux/socket.h index 54338fac45cb..4cc64d611cf4 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -94,7 +94,10 @@ struct cmsghdr { #define CMSG_ALIGN(len) ( ((len)+sizeof(long)-1) & ~(sizeof(long)-1) ) -#define CMSG_DATA(cmsg) ((void *)((char *)(cmsg) + sizeof(struct cmsghdr))) +#define CMSG_DATA(cmsg) \ + ((void *)(cmsg) + sizeof(struct cmsghdr)) +#define CMSG_USER_DATA(cmsg) \ + ((void __user *)(cmsg) + sizeof(struct cmsghdr)) #define CMSG_SPACE(len) (sizeof(struct cmsghdr) + CMSG_ALIGN(len)) #define CMSG_LEN(len) (sizeof(struct cmsghdr) + (len)) diff --git a/net/core/scm.c b/net/core/scm.c index dc6fed1f221c..abfdc85a64c1 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -236,7 +236,7 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) err = -EFAULT; if (copy_to_user(cm, &cmhdr, sizeof cmhdr)) goto out; - if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr))) + if (copy_to_user(CMSG_USER_DATA(cm), data, cmlen - sizeof(*cm))) goto out; cmlen = CMSG_SPACE(len); if (msg->msg_controllen < cmlen) @@ -300,7 +300,7 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) if (fdnum < fdmax) fdmax = fdnum; - for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i Date: Mon, 11 May 2020 13:59:12 +0200 Subject: net/scm: cleanup scm_detach_fds Factor out two helpers to keep the code tidy. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- net/core/scm.c | 94 +++++++++++++++++++++++++++++++--------------------------- 1 file changed, 51 insertions(+), 43 deletions(-) (limited to 'net') diff --git a/net/core/scm.c b/net/core/scm.c index abfdc85a64c1..168b006a52ff 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -277,78 +277,86 @@ void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_inter } EXPORT_SYMBOL(put_cmsg_scm_timestamping); +static int __scm_install_fd(struct file *file, int __user *ufd, int o_flags) +{ + struct socket *sock; + int new_fd; + int error; + + error = security_file_receive(file); + if (error) + return error; + + new_fd = get_unused_fd_flags(o_flags); + if (new_fd < 0) + return new_fd; + + error = put_user(new_fd, ufd); + if (error) { + put_unused_fd(new_fd); + return error; + } + + /* Bump the usage count and install the file. */ + sock = sock_from_file(file, &error); + if (sock) { + sock_update_netprioidx(&sock->sk->sk_cgrp_data); + sock_update_classid(&sock->sk->sk_cgrp_data); + } + fd_install(new_fd, get_file(file)); + return error; +} + +static int scm_max_fds(struct msghdr *msg) +{ + if (msg->msg_controllen <= sizeof(struct cmsghdr)) + return 0; + return (msg->msg_controllen - sizeof(struct cmsghdr)) / sizeof(int); +} + void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) { struct cmsghdr __user *cm = (__force struct cmsghdr __user*)msg->msg_control; - - int fdmax = 0; - int fdnum = scm->fp->count; - struct file **fp = scm->fp->fp; - int __user *cmfptr; + int o_flags = (msg->msg_flags & MSG_CMSG_CLOEXEC) ?
O_CLOEXEC : 0; int fdmax = min_t(int, scm_max_fds(msg), scm->fp->count); int __user *cmsg_data = CMSG_USER_DATA(cm); int err = 0, i; - if (MSG_CMSG_COMPAT & msg->msg_flags) { + if (msg->msg_flags & MSG_CMSG_COMPAT) { scm_detach_fds_compat(msg, scm); return; } - if (msg->msg_controllen > sizeof(struct cmsghdr)) - fdmax = ((msg->msg_controllen - sizeof(struct cmsghdr)) - / sizeof(int)); - - if (fdnum < fdmax) - fdmax = fdnum; - - for (i=0, cmfptr =(int __user *)CMSG_USER_DATA(cm); ifp->fp[i], cmsg_data + i, o_flags); if (err) break; - err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & msg->msg_flags - ? O_CLOEXEC : 0); - if (err < 0) - break; - new_fd = err; - err = put_user(new_fd, cmfptr); - if (err) { - put_unused_fd(new_fd); - break; - } - /* Bump the usage count and install the file. */ - sock = sock_from_file(fp[i], &err); - if (sock) { - sock_update_netprioidx(&sock->sk->sk_cgrp_data); - sock_update_classid(&sock->sk->sk_cgrp_data); - } - fd_install(new_fd, get_file(fp[i])); } - if (i > 0) - { - int cmlen = CMSG_LEN(i*sizeof(int)); + if (i > 0) { + int cmlen = CMSG_LEN(i * sizeof(int)); + err = put_user(SOL_SOCKET, &cm->cmsg_level); if (!err) err = put_user(SCM_RIGHTS, &cm->cmsg_type); if (!err) err = put_user(cmlen, &cm->cmsg_len); if (!err) { - cmlen = CMSG_SPACE(i*sizeof(int)); + cmlen = CMSG_SPACE(i * sizeof(int)); if (msg->msg_controllen < cmlen) cmlen = msg->msg_controllen; msg->msg_control += cmlen; msg->msg_controllen -= cmlen; } } - if (i < fdnum || (fdnum && fdmax <= 0)) + + if (i < scm->fp->count || (scm->fp->count && fdmax <= 0)) msg->msg_flags |= MSG_CTRUNC; /* - * All of the files that fit in the message have had their - * usage counts incremented, so we just free the list. + * All of the files that fit in the message have had their usage counts + * incremented, so we just free the list. */ __scm_destroy(scm); } -- cgit v1.2.3 From 1f466e1f15cf1dac7c86798d694649fc42cd868a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 11 May 2020 13:59:13 +0200 Subject: net: cleanly handle kernel vs user buffers for ->msg_control The msg_control field in struct msghdr can either contain a user pointer when used with the recvmsg system call, or a kernel pointer when used with sendmsg. To complicate things further kernel_recvmsg can stuff a kernel pointer in and then use set_fs to make the uaccess helpers accept it. Replace it with a union of a kernel pointer msg_control field, and a user pointer msg_control_user one, and allow kernel_recvmsg to operate on a proper kernel pointer using a bitfield to override the normal choice of a user pointer for recvmsg. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/linux/socket.h | 12 +++++++++++- net/compat.c | 5 +++-- net/core/scm.c | 49 ++++++++++++++++++++++++++++--------------------- net/ipv4/ip_sockglue.c | 3 ++- net/socket.c | 22 ++++++++---------------- 5 files changed, 50 insertions(+), 41 deletions(-) (limited to 'net') diff --git a/include/linux/socket.h b/include/linux/socket.h index 4cc64d611cf4..04d2bc97f497 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -50,7 +50,17 @@ struct msghdr { void *msg_name; /* ptr to socket address structure */ int msg_namelen; /* size of socket address structure */ struct iov_iter msg_iter; /* data */ - void *msg_control; /* ancillary data */ + + /* + * Ancillary data. msg_control_user is the user buffer used for the + * recv* side when msg_control_is_user is set, msg_control is the kernel + * buffer used for all other cases.
+ */ + union { + void *msg_control; + void __user *msg_control_user; + }; + bool msg_control_is_user : 1; __kernel_size_t msg_controllen; /* ancillary data buffer length */ unsigned int msg_flags; /* flags on received message */ struct kiocb *msg_iocb; /* ptr to iocb for async requests */ diff --git a/net/compat.c b/net/compat.c index 4bed96e84d9a..69fc6d1e4e6e 100644 --- a/net/compat.c +++ b/net/compat.c @@ -56,7 +56,8 @@ int __get_compat_msghdr(struct msghdr *kmsg, if (kmsg->msg_namelen > sizeof(struct sockaddr_storage)) kmsg->msg_namelen = sizeof(struct sockaddr_storage); - kmsg->msg_control = compat_ptr(msg.msg_control); + kmsg->msg_control_is_user = true; + kmsg->msg_control_user = compat_ptr(msg.msg_control); kmsg->msg_controllen = msg.msg_controllen; if (save_addr) @@ -121,7 +122,7 @@ int get_compat_msghdr(struct msghdr *kmsg, ((ucmlen) >= sizeof(struct compat_cmsghdr) && \ (ucmlen) <= (unsigned long) \ ((mhdr)->msg_controllen - \ - ((char *)(ucmsg) - (char *)(mhdr)->msg_control))) + ((char __user *)(ucmsg) - (char __user *)(mhdr)->msg_control_user))) static inline struct compat_cmsghdr __user *cmsg_compat_nxthdr(struct msghdr *msg, struct compat_cmsghdr __user *cmsg, int cmsg_len) diff --git a/net/core/scm.c b/net/core/scm.c index 168b006a52ff..a75cd637a71f 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -212,16 +212,12 @@ EXPORT_SYMBOL(__scm_send); int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) { - struct cmsghdr __user *cm - = (__force struct cmsghdr __user *)msg->msg_control; - struct cmsghdr cmhdr; int cmlen = CMSG_LEN(len); - int err; - if (MSG_CMSG_COMPAT & msg->msg_flags) + if (msg->msg_flags & MSG_CMSG_COMPAT) return put_cmsg_compat(msg, level, type, len, data); - if (cm==NULL || msg->msg_controllen < sizeof(*cm)) { + if (!msg->msg_control || msg->msg_controllen < sizeof(struct cmsghdr)) { msg->msg_flags |= MSG_CTRUNC; return 0; /* XXX: return error? check spec. 
*/ } @@ -229,23 +225,30 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) msg->msg_flags |= MSG_CTRUNC; cmlen = msg->msg_controllen; } - cmhdr.cmsg_level = level; - cmhdr.cmsg_type = type; - cmhdr.cmsg_len = cmlen; - - err = -EFAULT; - if (copy_to_user(cm, &cmhdr, sizeof cmhdr)) - goto out; - if (copy_to_user(CMSG_USER_DATA(cm), data, cmlen - sizeof(*cm))) - goto out; - cmlen = CMSG_SPACE(len); - if (msg->msg_controllen < cmlen) - cmlen = msg->msg_controllen; + + if (msg->msg_control_is_user) { + struct cmsghdr __user *cm = msg->msg_control_user; + struct cmsghdr cmhdr; + + cmhdr.cmsg_level = level; + cmhdr.cmsg_type = type; + cmhdr.cmsg_len = cmlen; + if (copy_to_user(cm, &cmhdr, sizeof cmhdr) || + copy_to_user(CMSG_USER_DATA(cm), data, cmlen - sizeof(*cm))) + return -EFAULT; + } else { + struct cmsghdr *cm = msg->msg_control; + + cm->cmsg_level = level; + cm->cmsg_type = type; + cm->cmsg_len = cmlen; + memcpy(CMSG_DATA(cm), data, cmlen - sizeof(*cm)); + } + + cmlen = min(CMSG_SPACE(len), msg->msg_controllen); msg->msg_control += cmlen; msg->msg_controllen -= cmlen; - err = 0; -out: - return err; + return 0; } EXPORT_SYMBOL(put_cmsg); @@ -328,6 +331,10 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) return; } + /* no use for FD passing from kernel space callers */ + if (WARN_ON_ONCE(!msg->msg_control_is_user)) + return; + for (i = 0; i < fdmax; i++) { err = __scm_install_fd(scm->fp->fp[i], cmsg_data + i, o_flags); if (err) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index aa3fd61818c4..8206047d70b6 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1492,7 +1492,8 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, if (sk->sk_type != SOCK_STREAM) return -ENOPROTOOPT; - msg.msg_control = (__force void *) optval; + msg.msg_control_is_user = true; + msg.msg_control_user = optval; msg.msg_controllen = len; msg.msg_flags = flags; diff --git a/net/socket.c b/net/socket.c index 2dd739fba866..1c9a7260a41d 100644 --- a/net/socket.c +++ b/net/socket.c @@ -924,14 +924,9 @@ EXPORT_SYMBOL(sock_recvmsg); int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size, int flags) { - mm_segment_t oldfs = get_fs(); - int result; - + msg->msg_control_is_user = false; iov_iter_kvec(&msg->msg_iter, READ, vec, num, size); - set_fs(KERNEL_DS); - result = sock_recvmsg(sock, msg, flags); - set_fs(oldfs); - return result; + return sock_recvmsg(sock, msg, flags); } EXPORT_SYMBOL(kernel_recvmsg); @@ -2239,7 +2234,8 @@ int __copy_msghdr_from_user(struct msghdr *kmsg, if (copy_from_user(&msg, umsg, sizeof(*umsg))) return -EFAULT; - kmsg->msg_control = (void __force *)msg.msg_control; + kmsg->msg_control_is_user = true; + kmsg->msg_control_user = msg.msg_control; kmsg->msg_controllen = msg.msg_controllen; kmsg->msg_flags = msg.msg_flags; @@ -2331,16 +2327,10 @@ static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys, goto out; } err = -EFAULT; - /* - * Careful! Before this, msg_sys->msg_control contains a user pointer. - * Afterwards, it will be a kernel pointer. Thus the compiler-assisted - * checking falls down on this. 
- */ - if (copy_from_user(ctl_buf, - (void __user __force *)msg_sys->msg_control, - ctl_len)) + if (copy_from_user(ctl_buf, msg_sys->msg_control_user, ctl_len)) goto out_freectl; msg_sys->msg_control = ctl_buf; + msg_sys->msg_control_is_user = false; } msg_sys->msg_flags = flags; -- cgit v1.2.3 From 54a0ed0df49609f4e3f098f8943e38e389dc2e15 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 12 May 2020 20:20:25 +0300 Subject: net: dsa: provide an option for drivers to always receive bridge VLANs DSA assumes that a bridge which has vlan filtering disabled is not vlan aware, and ignores all vlan configuration. However, the kernel software bridge code allows configuration in this state. This causes the kernel's idea of the bridge vlan state and the hardware state to disagree, so "bridge vlan show" indicates a correct configuration but the hardware lacks all configuration. Even worse, enabling vlan filtering on a DSA bridge immediately blocks all traffic which, given the output of "bridge vlan show", is very confusing. Provide an option that drivers can set to indicate they want to receive vlan configuration even when vlan filtering is disabled. At the very least, this is safe for Marvell DSA bridges, which do not look up ingress traffic in the VTU if the port is in 8021Q disabled state. It is also safe for the Ocelot switch family. Whether this change is suitable for all DSA bridges is not known. Signed-off-by: Russell King Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 7 +++++++ net/dsa/dsa_priv.h | 1 + net/dsa/port.c | 14 ++++++++++++++ net/dsa/slave.c | 8 ++++---- 4 files changed, 26 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 312c2f067e65..50389772c597 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -282,6 +282,13 @@ struct dsa_switch { */ bool vlan_filtering_is_global; + /* Pass .port_vlan_add and .port_vlan_del to drivers even for bridges + * that have vlan_filtering=0. All drivers should ideally set this (and + * then the option would get removed), but it is unknown whether this + * would break things or not. + */ + bool configure_vlan_while_not_filtering; + /* In case vlan_filtering_is_global is set, the VLAN awareness state * should be retrieved from here and not from the per-port settings. 
*/ diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index a1a0ae242012..adecf73bd608 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -138,6 +138,7 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br); void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br); int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, struct switchdev_trans *trans); +bool dsa_port_skip_vlan_configuration(struct dsa_port *dp); int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock, struct switchdev_trans *trans); int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu, diff --git a/net/dsa/port.c b/net/dsa/port.c index ebc8d6cbd1d4..e23ece229c7e 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -257,6 +257,20 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, return 0; } +/* This enforces legacy behavior for switch drivers which assume they can't + * receive VLAN configuration when enslaved to a bridge with vlan_filtering=0 + */ +bool dsa_port_skip_vlan_configuration(struct dsa_port *dp) +{ + struct dsa_switch *ds = dp->ds; + + if (!dp->bridge_dev) + return false; + + return (!ds->configure_vlan_while_not_filtering && + !br_vlan_enabled(dp->bridge_dev)); +} + int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock, struct switchdev_trans *trans) { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 61b0de52040a..886490fb203d 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -314,7 +314,7 @@ static int dsa_slave_vlan_add(struct net_device *dev, if (obj->orig_dev != dev) return -EOPNOTSUPP; - if (dp->bridge_dev && !br_vlan_enabled(dp->bridge_dev)) + if (dsa_port_skip_vlan_configuration(dp)) return 0; vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj); @@ -381,7 +381,7 @@ static int dsa_slave_vlan_del(struct net_device *dev, if (obj->orig_dev != dev) return -EOPNOTSUPP; - if (dp->bridge_dev && !br_vlan_enabled(dp->bridge_dev)) + if (dsa_port_skip_vlan_configuration(dp)) return 0; /* Do not deprogram the CPU port as it may be shared with other user @@ -1240,7 +1240,7 @@ static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto, * need to emulate the switchdev prepare + commit phase. */ if (dp->bridge_dev) { - if (!br_vlan_enabled(dp->bridge_dev)) + if (dsa_port_skip_vlan_configuration(dp)) return 0; /* br_vlan_get_info() returns -EINVAL or -ENOENT if the @@ -1274,7 +1274,7 @@ static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, * need to emulate the switchdev prepare + commit phase. */ if (dp->bridge_dev) { - if (!br_vlan_enabled(dp->bridge_dev)) + if (dsa_port_skip_vlan_configuration(dp)) return 0; /* br_vlan_get_info() returns -EINVAL or -ENOENT if the -- cgit v1.2.3 From 1f66b0f0aec671f8fbc86d75b2efdf7c7e0f7880 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 12 May 2020 20:20:26 +0300 Subject: net: dsa: tag_8021q: introduce a vid_is_dsa_8021q helper This function returns a boolean denoting whether the VLAN passed as argument is part of the 1024-3071 range that the dsa_8021q tagging scheme uses. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- include/linux/dsa/8021q.h | 7 +++++++ net/dsa/tag_8021q.c | 7 +++++++ 2 files changed, 14 insertions(+) (limited to 'net') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index b8daaec0896e..ebc245ff838a 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -50,6 +50,8 @@ int dsa_8021q_rx_switch_id(u16 vid); int dsa_8021q_rx_source_port(u16 vid); +bool vid_is_dsa_8021q(u16 vid); + #else int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index, @@ -107,6 +109,11 @@ int dsa_8021q_rx_source_port(u16 vid) return 0; } +bool vid_is_dsa_8021q(u16 vid) +{ + return false; +} + #endif /* IS_ENABLED(CONFIG_NET_DSA_TAG_8021Q) */ #endif /* _NET_DSA_8021Q_H */ diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index ff9c5bf64bda..4774ecd1f8fc 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -93,6 +93,13 @@ int dsa_8021q_rx_source_port(u16 vid) } EXPORT_SYMBOL_GPL(dsa_8021q_rx_source_port); +bool vid_is_dsa_8021q(u16 vid) +{ + return ((vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_RX || + (vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_TX); +} +EXPORT_SYMBOL_GPL(vid_is_dsa_8021q); + static int dsa_8021q_restore_pvid(struct dsa_switch *ds, int port) { struct bridge_vlan_info vinfo; -- cgit v1.2.3 From ec5ae61076d07be986df19773662506220757c9f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 12 May 2020 20:20:29 +0300 Subject: net: dsa: sja1105: save/restore VLANs using a delta commit method Managing the VLAN table that is present in hardware will become very difficult once we add a third operating state (best_effort_vlan_filtering). That is because correct cleanup (not too little, not too much) becomes virtually impossible, when VLANs can be added from the bridge layer, from dsa_8021q for basic tagging, for cross-chip bridging, as well as retagging rules for sub-VLANs and cross-chip sub-VLANs. So we need to rethink VLAN interaction with the switch in a more scalable way. In preparation for that, use the priv->expect_dsa_8021q boolean to classify any VLAN request received through .port_vlan_add or .port_vlan_del towards either one of 2 internal lists: bridge VLANs and dsa_8021q VLANs. Then, implement a central sja1105_build_vlan_table method that creates a VLAN configuration from scratch based on the 2 lists of VLANs kept by the driver, and based on the VLAN awareness state. Currently, if we are VLAN-unaware, install the dsa_8021q VLANs, otherwise the bridge VLANs. Then, implement a delta commit procedure that identifies which VLANs from this new configuration are actually different from the config previously committed to hardware. We apply the delta through the dynamic configuration interface (we don't reset the switch). The result is that the hardware should see the exact sequence of operations as before this patch. This also helps remove the "br" argument passed to dsa_8021q_crosschip_bridge_join, which it was only using to figure out whether it should commit the configuration back to us or not, based on the VLAN awareness state of the bridge. We can simplify that, by always allowing those VLANs inside of our dsa_8021q_vlans list, and committing those to hardware when necessary. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/dsa/sja1105/sja1105.h | 10 + drivers/net/dsa/sja1105/sja1105_main.c | 493 ++++++++++++++++++++++++--------- include/linux/dsa/8021q.h | 19 +- net/dsa/tag_8021q.c | 45 ++- 4 files changed, 393 insertions(+), 174 deletions(-) (limited to 'net') diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h index 667056d0c819..c80f1999c694 100644 --- a/drivers/net/dsa/sja1105/sja1105.h +++ b/drivers/net/dsa/sja1105/sja1105.h @@ -178,6 +178,14 @@ struct sja1105_flow_block { int num_virtual_links; }; +struct sja1105_bridge_vlan { + struct list_head list; + int port; + u16 vid; + bool pvid; + bool untagged; +}; + enum sja1105_vlan_state { SJA1105_VLAN_UNAWARE, SJA1105_VLAN_FILTERING_FULL, @@ -191,6 +199,8 @@ struct sja1105_private { struct gpio_desc *reset_gpio; struct spi_device *spidev; struct dsa_switch *ds; + struct list_head dsa_8021q_vlans; + struct list_head bridge_vlans; struct list_head crosschip_links; struct sja1105_flow_block flow_block; struct sja1105_port ports[SJA1105_NUM_PORTS]; diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 8e68adba9144..fb95130299b1 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -303,7 +303,8 @@ static int sja1105_init_static_vlan(struct sja1105_private *priv) .tag_port = 0, .vlanid = 1, }; - int i; + struct dsa_switch *ds = priv->ds; + int port; table = &priv->static_config.tables[BLK_IDX_VLAN_LOOKUP]; @@ -324,12 +325,31 @@ static int sja1105_init_static_vlan(struct sja1105_private *priv) table->entry_count = 1; /* VLAN 1: all DT-defined ports are members; no restrictions on - * forwarding; always transmit priority-tagged frames as untagged. + * forwarding; always transmit as untagged. */ - for (i = 0; i < SJA1105_NUM_PORTS; i++) { - pvid.vmemb_port |= BIT(i); - pvid.vlan_bc |= BIT(i); - pvid.tag_port &= ~BIT(i); + for (port = 0; port < ds->num_ports; port++) { + struct sja1105_bridge_vlan *v; + + if (dsa_is_unused_port(ds, port)) + continue; + + pvid.vmemb_port |= BIT(port); + pvid.vlan_bc |= BIT(port); + pvid.tag_port &= ~BIT(port); + + /* Let traffic that don't need dsa_8021q (e.g. STP, PTP) be + * transmitted as untagged. + */ + v = kzalloc(sizeof(*v), GFP_KERNEL); + if (!v) + return -ENOMEM; + + v->port = port; + v->vid = 1; + v->untagged = true; + if (dsa_is_cpu_port(ds, port)) + v->pvid = true; + list_add(&v->list, &priv->dsa_8021q_vlans); } ((struct sja1105_vlan_lookup_entry *)table->entries)[0] = pvid; @@ -1717,82 +1737,6 @@ static int sja1105_pvid_apply(struct sja1105_private *priv, int port, u16 pvid) &mac[port], true); } -static int sja1105_is_vlan_configured(struct sja1105_private *priv, u16 vid) -{ - struct sja1105_vlan_lookup_entry *vlan; - int count, i; - - vlan = priv->static_config.tables[BLK_IDX_VLAN_LOOKUP].entries; - count = priv->static_config.tables[BLK_IDX_VLAN_LOOKUP].entry_count; - - for (i = 0; i < count; i++) - if (vlan[i].vlanid == vid) - return i; - - /* Return an invalid entry index if not found */ - return -1; -} - -static int sja1105_vlan_apply(struct sja1105_private *priv, int port, u16 vid, - bool enabled, bool untagged) -{ - struct sja1105_vlan_lookup_entry *vlan; - struct sja1105_table *table; - bool keep = true; - int match, rc; - - table = &priv->static_config.tables[BLK_IDX_VLAN_LOOKUP]; - - match = sja1105_is_vlan_configured(priv, vid); - if (match < 0) { - /* Can't delete a missing entry. 
*/ - if (!enabled) - return 0; - rc = sja1105_table_resize(table, table->entry_count + 1); - if (rc) - return rc; - match = table->entry_count - 1; - } - /* Assign pointer after the resize (it's new memory) */ - vlan = table->entries; - vlan[match].vlanid = vid; - if (enabled) { - vlan[match].vlan_bc |= BIT(port); - vlan[match].vmemb_port |= BIT(port); - } else { - vlan[match].vlan_bc &= ~BIT(port); - vlan[match].vmemb_port &= ~BIT(port); - } - /* Also unset tag_port if removing this VLAN was requested, - * just so we don't have a confusing bitmap (no practical purpose). - */ - if (untagged || !enabled) - vlan[match].tag_port &= ~BIT(port); - else - vlan[match].tag_port |= BIT(port); - /* If there's no port left as member of this VLAN, - * it's time for it to go. - */ - if (!vlan[match].vmemb_port) - keep = false; - - dev_dbg(priv->ds->dev, - "%s: port %d, vid %llu, broadcast domain 0x%llx, " - "port members 0x%llx, tagged ports 0x%llx, keep %d\n", - __func__, port, vlan[match].vlanid, vlan[match].vlan_bc, - vlan[match].vmemb_port, vlan[match].tag_port, keep); - - rc = sja1105_dynamic_config_write(priv, BLK_IDX_VLAN_LOOKUP, vid, - &vlan[match], keep); - if (rc < 0) - return rc; - - if (!keep) - return sja1105_table_delete_entry(table, match); - - return 0; -} - static int sja1105_crosschip_bridge_join(struct dsa_switch *ds, int tree_index, int sw_index, int other_port, struct net_device *br) @@ -1813,7 +1757,7 @@ static int sja1105_crosschip_bridge_join(struct dsa_switch *ds, other_priv->expect_dsa_8021q = true; rc = dsa_8021q_crosschip_bridge_join(ds, port, other_ds, - other_port, br, + other_port, &priv->crosschip_links); other_priv->expect_dsa_8021q = false; if (rc) @@ -1821,7 +1765,7 @@ static int sja1105_crosschip_bridge_join(struct dsa_switch *ds, priv->expect_dsa_8021q = true; rc = dsa_8021q_crosschip_bridge_join(other_ds, other_port, ds, - port, br, + port, &other_priv->crosschip_links); priv->expect_dsa_8021q = false; if (rc) @@ -1852,35 +1796,16 @@ static void sja1105_crosschip_bridge_leave(struct dsa_switch *ds, other_priv->expect_dsa_8021q = true; dsa_8021q_crosschip_bridge_leave(ds, port, other_ds, other_port, - br, &priv->crosschip_links); + &priv->crosschip_links); other_priv->expect_dsa_8021q = false; priv->expect_dsa_8021q = true; - dsa_8021q_crosschip_bridge_leave(other_ds, other_port, ds, - port, br, + dsa_8021q_crosschip_bridge_leave(other_ds, other_port, ds, port, &other_priv->crosschip_links); priv->expect_dsa_8021q = false; } } -static int sja1105_replay_crosschip_vlans(struct dsa_switch *ds, bool enabled) -{ - struct sja1105_private *priv = ds->priv; - struct dsa_8021q_crosschip_link *c; - int rc; - - list_for_each_entry(c, &priv->crosschip_links, list) { - priv->expect_dsa_8021q = true; - rc = dsa_8021q_crosschip_link_apply(ds, c->port, c->other_ds, - c->other_port, enabled); - priv->expect_dsa_8021q = false; - if (rc) - break; - } - - return rc; -} - static int sja1105_setup_8021q_tagging(struct dsa_switch *ds, bool enabled) { struct sja1105_private *priv = ds->priv; @@ -1896,11 +1821,6 @@ static int sja1105_setup_8021q_tagging(struct dsa_switch *ds, bool enabled) return rc; } } - rc = sja1105_replay_crosschip_vlans(ds, enabled); - if (rc) { - dev_err(ds->dev, "Failed to replay crosschip VLANs: %d\n", rc); - return rc; - } dev_info(ds->dev, "%s switch tagging\n", enabled ? 
"Enabled" : "Disabled"); @@ -1914,6 +1834,269 @@ sja1105_get_tag_protocol(struct dsa_switch *ds, int port, return DSA_TAG_PROTO_SJA1105; } +static int sja1105_is_vlan_configured(struct sja1105_private *priv, u16 vid) +{ + struct sja1105_vlan_lookup_entry *vlan; + int count, i; + + vlan = priv->static_config.tables[BLK_IDX_VLAN_LOOKUP].entries; + count = priv->static_config.tables[BLK_IDX_VLAN_LOOKUP].entry_count; + + for (i = 0; i < count; i++) + if (vlan[i].vlanid == vid) + return i; + + /* Return an invalid entry index if not found */ + return -1; +} + +static int sja1105_commit_vlans(struct sja1105_private *priv, + struct sja1105_vlan_lookup_entry *new_vlan) +{ + struct sja1105_vlan_lookup_entry *vlan; + struct sja1105_table *table; + int num_vlans = 0; + int rc, i, k = 0; + + /* VLAN table */ + table = &priv->static_config.tables[BLK_IDX_VLAN_LOOKUP]; + vlan = table->entries; + + for (i = 0; i < VLAN_N_VID; i++) { + int match = sja1105_is_vlan_configured(priv, i); + + if (new_vlan[i].vlanid != VLAN_N_VID) + num_vlans++; + + if (new_vlan[i].vlanid == VLAN_N_VID && match >= 0) { + /* Was there before, no longer is. Delete */ + dev_dbg(priv->ds->dev, "Deleting VLAN %d\n", i); + rc = sja1105_dynamic_config_write(priv, + BLK_IDX_VLAN_LOOKUP, + i, &vlan[match], false); + if (rc < 0) + return rc; + } else if (new_vlan[i].vlanid != VLAN_N_VID) { + /* Nothing changed, don't do anything */ + if (match >= 0 && + vlan[match].vlanid == new_vlan[i].vlanid && + vlan[match].tag_port == new_vlan[i].tag_port && + vlan[match].vlan_bc == new_vlan[i].vlan_bc && + vlan[match].vmemb_port == new_vlan[i].vmemb_port) + continue; + /* Update entry */ + dev_dbg(priv->ds->dev, "Updating VLAN %d\n", i); + rc = sja1105_dynamic_config_write(priv, + BLK_IDX_VLAN_LOOKUP, + i, &new_vlan[i], + true); + if (rc < 0) + return rc; + } + } + + if (table->entry_count) + kfree(table->entries); + + table->entries = kcalloc(num_vlans, table->ops->unpacked_entry_size, + GFP_KERNEL); + if (!table->entries) + return -ENOMEM; + + table->entry_count = num_vlans; + vlan = table->entries; + + for (i = 0; i < VLAN_N_VID; i++) { + if (new_vlan[i].vlanid == VLAN_N_VID) + continue; + vlan[k++] = new_vlan[i]; + } + + return 0; +} + +struct sja1105_crosschip_switch { + struct list_head list; + struct dsa_switch *other_ds; +}; + +static int sja1105_commit_pvid(struct sja1105_private *priv) +{ + struct sja1105_bridge_vlan *v; + struct list_head *vlan_list; + int rc = 0; + + if (priv->vlan_state == SJA1105_VLAN_FILTERING_FULL) + vlan_list = &priv->bridge_vlans; + else + vlan_list = &priv->dsa_8021q_vlans; + + list_for_each_entry(v, vlan_list, list) { + if (v->pvid) { + rc = sja1105_pvid_apply(priv, v->port, v->vid); + if (rc) + break; + } + } + + return rc; +} + +static int +sja1105_build_bridge_vlans(struct sja1105_private *priv, + struct sja1105_vlan_lookup_entry *new_vlan) +{ + struct sja1105_bridge_vlan *v; + + if (priv->vlan_state == SJA1105_VLAN_UNAWARE) + return 0; + + list_for_each_entry(v, &priv->bridge_vlans, list) { + int match = v->vid; + + new_vlan[match].vlanid = v->vid; + new_vlan[match].vmemb_port |= BIT(v->port); + new_vlan[match].vlan_bc |= BIT(v->port); + if (!v->untagged) + new_vlan[match].tag_port |= BIT(v->port); + } + + return 0; +} + +static int +sja1105_build_dsa_8021q_vlans(struct sja1105_private *priv, + struct sja1105_vlan_lookup_entry *new_vlan) +{ + struct sja1105_bridge_vlan *v; + + if (priv->vlan_state == SJA1105_VLAN_FILTERING_FULL) + return 0; + + list_for_each_entry(v, &priv->dsa_8021q_vlans, list) { + int 
match = v->vid; + + new_vlan[match].vlanid = v->vid; + new_vlan[match].vmemb_port |= BIT(v->port); + new_vlan[match].vlan_bc |= BIT(v->port); + if (!v->untagged) + new_vlan[match].tag_port |= BIT(v->port); + } + + return 0; +} + +static int sja1105_build_vlan_table(struct sja1105_private *priv, bool notify); + +static int sja1105_notify_crosschip_switches(struct sja1105_private *priv) +{ + struct sja1105_crosschip_switch *s, *pos; + struct list_head crosschip_switches; + struct dsa_8021q_crosschip_link *c; + int rc = 0; + + INIT_LIST_HEAD(&crosschip_switches); + + list_for_each_entry(c, &priv->crosschip_links, list) { + bool already_added = false; + + list_for_each_entry(s, &crosschip_switches, list) { + if (s->other_ds == c->other_ds) { + already_added = true; + break; + } + } + + if (already_added) + continue; + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (!s) { + dev_err(priv->ds->dev, "Failed to allocate memory\n"); + rc = -ENOMEM; + goto out; + } + s->other_ds = c->other_ds; + list_add(&s->list, &crosschip_switches); + } + + list_for_each_entry(s, &crosschip_switches, list) { + struct sja1105_private *other_priv = s->other_ds->priv; + + rc = sja1105_build_vlan_table(other_priv, false); + if (rc) + goto out; + } + +out: + list_for_each_entry_safe(s, pos, &crosschip_switches, list) { + list_del(&s->list); + kfree(s); + } + + return rc; +} + +static int sja1105_build_vlan_table(struct sja1105_private *priv, bool notify) +{ + struct sja1105_vlan_lookup_entry *new_vlan; + struct sja1105_table *table; + int rc; + int i; + + table = &priv->static_config.tables[BLK_IDX_VLAN_LOOKUP]; + new_vlan = kcalloc(VLAN_N_VID, + table->ops->unpacked_entry_size, GFP_KERNEL); + if (!new_vlan) + return -ENOMEM; + + for (i = 0; i < VLAN_N_VID; i++) + new_vlan[i].vlanid = VLAN_N_VID; + + /* Bridge VLANs */ + rc = sja1105_build_bridge_vlans(priv, new_vlan); + if (rc) + goto out; + + /* VLANs necessary for dsa_8021q operation, given to us by tag_8021q.c: + * - RX VLANs + * - TX VLANs + * - Crosschip links + */ + rc = sja1105_build_dsa_8021q_vlans(priv, new_vlan); + if (rc) + goto out; + + rc = sja1105_commit_vlans(priv, new_vlan); + if (rc) + goto out; + + rc = sja1105_commit_pvid(priv); + if (rc) + goto out; + + if (notify) { + rc = sja1105_notify_crosschip_switches(priv); + if (rc) + goto out; + } + +out: + kfree(new_vlan); + + return rc; +} + +/* Select the list to which we should add this VLAN. 
*/ +static struct list_head *sja1105_classify_vlan(struct sja1105_private *priv, + u16 vid) +{ + if (priv->expect_dsa_8021q) + return &priv->dsa_8021q_vlans; + + return &priv->bridge_vlans; +} + static int sja1105_vlan_prepare(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan) { @@ -2026,45 +2209,80 @@ static void sja1105_vlan_add(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan) { struct sja1105_private *priv = ds->priv; + bool vlan_table_changed = false; u16 vid; int rc; for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - rc = sja1105_vlan_apply(priv, port, vid, true, vlan->flags & - BRIDGE_VLAN_INFO_UNTAGGED); - if (rc < 0) { - dev_err(ds->dev, "Failed to add VLAN %d to port %d: %d\n", - vid, port, rc); - return; - } - if (vlan->flags & BRIDGE_VLAN_INFO_PVID) { - rc = sja1105_pvid_apply(ds->priv, port, vid); - if (rc < 0) { - dev_err(ds->dev, "Failed to set pvid %d on port %d: %d\n", - vid, port, rc); - return; + bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; + bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; + struct sja1105_bridge_vlan *v; + struct list_head *vlan_list; + bool already_added = false; + + vlan_list = sja1105_classify_vlan(priv, vid); + + list_for_each_entry(v, vlan_list, list) { + if (v->port == port && v->vid == vid && + v->untagged == untagged && v->pvid == pvid) { + already_added = true; + break; } } + + if (already_added) + continue; + + v = kzalloc(sizeof(*v), GFP_KERNEL); + if (!v) { + dev_err(ds->dev, "Out of memory while storing VLAN\n"); + return; + } + + v->port = port; + v->vid = vid; + v->untagged = untagged; + v->pvid = pvid; + list_add(&v->list, vlan_list); + + vlan_table_changed = true; } + + if (!vlan_table_changed) + return; + + rc = sja1105_build_vlan_table(priv, true); + if (rc) + dev_err(ds->dev, "Failed to build VLAN table: %d\n", rc); } static int sja1105_vlan_del(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan) { struct sja1105_private *priv = ds->priv; + bool vlan_table_changed = false; u16 vid; - int rc; for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - rc = sja1105_vlan_apply(priv, port, vid, false, vlan->flags & - BRIDGE_VLAN_INFO_UNTAGGED); - if (rc < 0) { - dev_err(ds->dev, "Failed to remove VLAN %d from port %d: %d\n", - vid, port, rc); - return rc; + struct sja1105_bridge_vlan *v, *n; + struct list_head *vlan_list; + + vlan_list = sja1105_classify_vlan(priv, vid); + + list_for_each_entry_safe(v, n, vlan_list, list) { + if (v->port == port && v->vid == vid) { + list_del(&v->list); + kfree(v); + vlan_table_changed = true; + break; + } } } - return 0; + + if (!vlan_table_changed) + return 0; + + return sja1105_build_vlan_table(priv, true); } /* The programming model for the SJA1105 switch is "all-at-once" via static @@ -2142,6 +2360,7 @@ static int sja1105_setup(struct dsa_switch *ds) static void sja1105_teardown(struct dsa_switch *ds) { struct sja1105_private *priv = ds->priv; + struct sja1105_bridge_vlan *v, *n; int port; for (port = 0; port < SJA1105_NUM_PORTS; port++) { @@ -2158,6 +2377,16 @@ static void sja1105_teardown(struct dsa_switch *ds) sja1105_tas_teardown(ds); sja1105_ptp_clock_unregister(ds); sja1105_static_config_free(&priv->static_config); + + list_for_each_entry_safe(v, n, &priv->dsa_8021q_vlans, list) { + list_del(&v->list); + kfree(v); + } + + list_for_each_entry_safe(v, n, &priv->bridge_vlans, list) { + list_del(&v->list); + kfree(v); + } } static int sja1105_port_enable(struct dsa_switch *ds, int port, @@ -2598,6 
+2827,8 @@ static int sja1105_probe(struct spi_device *spi) mutex_init(&priv->mgmt_lock); INIT_LIST_HEAD(&priv->crosschip_links); + INIT_LIST_HEAD(&priv->bridge_vlans); + INIT_LIST_HEAD(&priv->dsa_8021q_vlans); sja1105_tas_setup(ds); sja1105_flower_setup(ds); diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index ebc245ff838a..404bd2cce642 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -25,18 +25,14 @@ struct dsa_8021q_crosschip_link { int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index, bool enabled); -int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port, - struct dsa_switch *other_ds, - int other_port, bool enabled); - int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, struct dsa_switch *other_ds, - int other_port, struct net_device *br, + int other_port, struct list_head *crosschip_links); int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, struct dsa_switch *other_ds, - int other_port, struct net_device *br, + int other_port, struct list_head *crosschip_links); struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, @@ -60,16 +56,9 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index, return 0; } -int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port, - struct dsa_switch *other_ds, - int other_port, bool enabled) -{ - return 0; -} - int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, struct dsa_switch *other_ds, - int other_port, struct net_device *br, + int other_port, struct list_head *crosschip_links) { return 0; @@ -77,7 +66,7 @@ int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, struct dsa_switch *other_ds, - int other_port, struct net_device *br, + int other_port, struct list_head *crosschip_links) { return 0; diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 4774ecd1f8fc..3236fbbf85b9 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -296,9 +296,9 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled) } EXPORT_SYMBOL_GPL(dsa_port_setup_8021q_tagging); -int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port, - struct dsa_switch *other_ds, - int other_port, bool enabled) +static int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, bool enabled) { u16 rx_vid = dsa_8021q_rx_vid(ds, port); @@ -308,7 +308,6 @@ int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port, return dsa_8021q_vid_apply(other_ds, other_port, rx_vid, BRIDGE_VLAN_INFO_UNTAGGED, enabled); } -EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_link_apply); static int dsa_8021q_crosschip_link_add(struct dsa_switch *ds, int port, struct dsa_switch *other_ds, @@ -369,7 +368,7 @@ static void dsa_8021q_crosschip_link_del(struct dsa_switch *ds, */ int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, struct dsa_switch *other_ds, - int other_port, struct net_device *br, + int other_port, struct list_head *crosschip_links) { /* @other_upstream is how @other_ds reaches us. 
If we are part @@ -385,12 +384,10 @@ int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, if (rc) return rc; - if (!br_vlan_enabled(br)) { - rc = dsa_8021q_crosschip_link_apply(ds, port, other_ds, - other_port, true); - if (rc) - return rc; - } + rc = dsa_8021q_crosschip_link_apply(ds, port, other_ds, + other_port, true); + if (rc) + return rc; rc = dsa_8021q_crosschip_link_add(ds, port, other_ds, other_upstream, @@ -398,20 +395,14 @@ int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, if (rc) return rc; - if (!br_vlan_enabled(br)) { - rc = dsa_8021q_crosschip_link_apply(ds, port, other_ds, - other_upstream, true); - if (rc) - return rc; - } - - return 0; + return dsa_8021q_crosschip_link_apply(ds, port, other_ds, + other_upstream, true); } EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_bridge_join); int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, struct dsa_switch *other_ds, - int other_port, struct net_device *br, + int other_port, struct list_head *crosschip_links) { int other_upstream = dsa_upstream_port(other_ds, other_port); @@ -431,14 +422,12 @@ int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, if (keep) continue; - if (!br_vlan_enabled(br)) { - rc = dsa_8021q_crosschip_link_apply(ds, port, - other_ds, - other_port, - false); - if (rc) - return rc; - } + rc = dsa_8021q_crosschip_link_apply(ds, port, + other_ds, + other_port, + false); + if (rc) + return rc; } } -- cgit v1.2.3 From 38b5beeae7a4cde87edabb0196fac1f55ae668ee Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 12 May 2020 20:20:32 +0300 Subject: net: dsa: sja1105: prepare tagger for handling DSA tags and VLAN simultaneously In VLAN-unaware mode, sja1105 uses VLAN tags with a custom TPID of 0xdadb. While in the yet-to-be introduced best_effort_vlan_filtering mode, it needs to work with normal VLAN TPID values. A complication arises when we must transmit a VLAN-tagged packet to the switch when it's in VLAN-aware mode. We need to construct a packet with 2 VLAN tags, and the switch will use the outer header for routing and pop it on egress. But sadly, here the 2 hardware generations don't behave the same: - E/T switches won't pop an ETH_P_8021AD tag on egress, it seems (packets will remain double-tagged). - P/Q/R/S switches will drop a packet with 2 ETH_P_8021Q tags (it looks like it tries to prevent VLAN hopping). But looks like the reverse is also true: - E/T switches have no problem popping the outer tag from packets with 2 ETH_P_8021Q tags. - P/Q/R/S will have no problem popping a single tag even if that is ETH_P_8021AD. So it is clear that if we want the hardware to work with dsa_8021q tagging in VLAN-aware mode, we need to send different TPIDs depending on revision. Keep that information in priv->info->qinq_tpid. The per-port tagger structure will hold an xmit_tpid value that depends not only upon the qinq_tpid, but also upon the VLAN awareness state itself (in case we must transmit using 0xdadb). Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/dsa/sja1105/sja1105.h | 6 ++++++ drivers/net/dsa/sja1105/sja1105_main.c | 10 ++++++++++ drivers/net/dsa/sja1105/sja1105_spi.c | 6 ++++++ include/linux/dsa/sja1105.h | 1 + net/dsa/tag_sja1105.c | 32 +++++++++++++++++++++----------- 5 files changed, 44 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h index c80f1999c694..a019ffae38f1 100644 --- a/drivers/net/dsa/sja1105/sja1105.h +++ b/drivers/net/dsa/sja1105/sja1105.h @@ -87,6 +87,12 @@ struct sja1105_info { const struct sja1105_dynamic_table_ops *dyn_ops; const struct sja1105_table_ops *static_ops; const struct sja1105_regs *regs; + /* Both E/T and P/Q/R/S have quirks when it comes to popping the S-Tag + * from double-tagged frames. E/T will pop it only when it's equal to + * TPID from the General Parameters Table, while P/Q/R/S will only + * pop it when it's equal to TPID2. + */ + u16 qinq_tpid; int (*reset_cmd)(struct dsa_switch *ds); int (*setup_rgmii_delay)(const void *ctx, int port); /* Prototypes from include/net/dsa.h */ diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 7b9c3db98e1d..b7e4a85caade 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -2153,6 +2153,15 @@ static int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled) tpid2 = ETH_P_SJA1105; } + for (port = 0; port < ds->num_ports; port++) { + struct sja1105_port *sp = &priv->ports[port]; + + if (enabled) + sp->xmit_tpid = priv->info->qinq_tpid; + else + sp->xmit_tpid = ETH_P_SJA1105; + } + if (!enabled) state = SJA1105_VLAN_UNAWARE; else @@ -2866,6 +2875,7 @@ static int sja1105_probe(struct spi_device *spi) goto out; } skb_queue_head_init(&sp->xmit_queue); + sp->xmit_tpid = ETH_P_SJA1105; } return 0; diff --git a/drivers/net/dsa/sja1105/sja1105_spi.c b/drivers/net/dsa/sja1105/sja1105_spi.c index 0be75c49e6c3..a0dacae803cc 100644 --- a/drivers/net/dsa/sja1105/sja1105_spi.c +++ b/drivers/net/dsa/sja1105/sja1105_spi.c @@ -512,6 +512,7 @@ struct sja1105_info sja1105e_info = { .part_no = SJA1105ET_PART_NO, .static_ops = sja1105e_table_ops, .dyn_ops = sja1105et_dyn_ops, + .qinq_tpid = ETH_P_8021Q, .ptp_ts_bits = 24, .ptpegr_ts_bytes = 4, .reset_cmd = sja1105et_reset_cmd, @@ -526,6 +527,7 @@ struct sja1105_info sja1105t_info = { .part_no = SJA1105ET_PART_NO, .static_ops = sja1105t_table_ops, .dyn_ops = sja1105et_dyn_ops, + .qinq_tpid = ETH_P_8021Q, .ptp_ts_bits = 24, .ptpegr_ts_bytes = 4, .reset_cmd = sja1105et_reset_cmd, @@ -540,6 +542,7 @@ struct sja1105_info sja1105p_info = { .part_no = SJA1105P_PART_NO, .static_ops = sja1105p_table_ops, .dyn_ops = sja1105pqrs_dyn_ops, + .qinq_tpid = ETH_P_8021AD, .ptp_ts_bits = 32, .ptpegr_ts_bytes = 8, .setup_rgmii_delay = sja1105pqrs_setup_rgmii_delay, @@ -555,6 +558,7 @@ struct sja1105_info sja1105q_info = { .part_no = SJA1105Q_PART_NO, .static_ops = sja1105q_table_ops, .dyn_ops = sja1105pqrs_dyn_ops, + .qinq_tpid = ETH_P_8021AD, .ptp_ts_bits = 32, .ptpegr_ts_bytes = 8, .setup_rgmii_delay = sja1105pqrs_setup_rgmii_delay, @@ -570,6 +574,7 @@ struct sja1105_info sja1105r_info = { .part_no = SJA1105R_PART_NO, .static_ops = sja1105r_table_ops, .dyn_ops = sja1105pqrs_dyn_ops, + .qinq_tpid = ETH_P_8021AD, .ptp_ts_bits = 32, .ptpegr_ts_bytes = 8, .setup_rgmii_delay = sja1105pqrs_setup_rgmii_delay, @@ -586,6 +591,7 @@ struct sja1105_info sja1105s_info = { .static_ops = sja1105s_table_ops, .dyn_ops = sja1105pqrs_dyn_ops, .regs = 
&sja1105pqrs_regs, + .qinq_tpid = ETH_P_8021AD, .ptp_ts_bits = 32, .ptpegr_ts_bytes = 8, .setup_rgmii_delay = sja1105pqrs_setup_rgmii_delay, diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index fa5735c353cd..f821d08b1b5f 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -59,6 +59,7 @@ struct sja1105_port { struct sja1105_tagger_data *data; struct dsa_port *dp; bool hwts_tx_en; + u16 xmit_tpid; }; #endif /* _NET_DSA_SJA1105_H */ diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 5ecac5921a7d..398e2b9a1b96 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -69,12 +69,25 @@ static inline bool sja1105_is_meta_frame(const struct sk_buff *skb) return true; } +static bool sja1105_can_use_vlan_as_tags(const struct sk_buff *skb) +{ + struct vlan_ethhdr *hdr = vlan_eth_hdr(skb); + + if (hdr->h_vlan_proto == ntohs(ETH_P_SJA1105)) + return true; + + if (hdr->h_vlan_proto != ntohs(ETH_P_8021Q)) + return false; + + return vid_is_dsa_8021q(ntohs(hdr->h_vlan_TCI) & VLAN_VID_MASK); +} + /* This is the first time the tagger sees the frame on RX. * Figure out if we can decode it. */ static bool sja1105_filter(const struct sk_buff *skb, struct net_device *dev) { - if (!dsa_port_is_vlan_filtering(dev->dsa_ptr)) + if (sja1105_can_use_vlan_as_tags(skb)) return true; if (sja1105_is_link_local(skb)) return true; @@ -96,6 +109,11 @@ static struct sk_buff *sja1105_defer_xmit(struct sja1105_port *sp, return NULL; } +static u16 sja1105_xmit_tpid(struct sja1105_port *sp) +{ + return sp->xmit_tpid; +} + static struct sk_buff *sja1105_xmit(struct sk_buff *skb, struct net_device *netdev) { @@ -111,15 +129,7 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb, if (unlikely(sja1105_is_link_local(skb))) return sja1105_defer_xmit(dp->priv, skb); - /* If we are under a vlan_filtering bridge, IP termination on - * switch ports based on 802.1Q tags is simply too brittle to - * be passable. So just defer to the dsa_slave_notag_xmit - * implementation. - */ - if (dsa_port_is_vlan_filtering(dp)) - return skb; - - return dsa_8021q_xmit(skb, netdev, ETH_P_SJA1105, + return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp->priv), ((pcp << VLAN_PRIO_SHIFT) | tx_vid)); } @@ -258,7 +268,7 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, hdr = eth_hdr(skb); tpid = ntohs(hdr->h_proto); - is_tagged = (tpid == ETH_P_SJA1105); + is_tagged = (tpid == ETH_P_SJA1105 || tpid == ETH_P_8021Q); is_link_local = sja1105_is_link_local(skb); is_meta = sja1105_is_meta_frame(skb); -- cgit v1.2.3 From 3eaae1d05f2b5be1be834bfad64f8fc2ad39a56d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 12 May 2020 20:20:33 +0300 Subject: net: dsa: tag_8021q: support up to 8 VLANs per port using sub-VLANs For switches that support VLAN retagging, such as sja1105, we extend dsa_8021q by encoding a "sub-VLAN" into the remaining 3 free bits in the dsa_8021q tag. A sub-VLAN is nothing more than a number in the range 0-7, which serves as an index into a per-port driver lookup table. The sub-VLAN value of zero means that traffic is untagged (this is also backwards-compatible with dsa_8021q without retagging). The switch should be configured to retag VLAN-tagged traffic that gets transmitted towards the CPU port (and towards the CPU only). Example: bridge vlan add dev sw1p0 vid 100 The switch retags frames received on port 0, going to the CPU, and having VID 100, to the VID of 1104 (0x0450). 
In dsa_8021q language: | 11 | 10 | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | +-----------+-----+-----------------+-----------+-----------------------+ | DIR | SVL | SWITCH_ID | SUBVLAN | PORT | +-----------+-----+-----------------+-----------+-----------------------+ 0x0450 means: - DIR = 0b01: this is an RX VLAN - SUBVLAN = 0b001: this is subvlan #1 - SWITCH_ID = 0b001: this is switch 1 (see the name "sw1p0") - PORT = 0b0000: this is port 0 (see the name "sw1p0") The driver also remembers the "1 -> 100" mapping. In the hotpath, if the sub-VLAN from the tag encodes a non-untagged frame, this mapping is used to create a VLAN hwaccel tag, with the value of 100. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/dsa/8021q.h | 16 ++++++++++++++ net/dsa/tag_8021q.c | 56 ++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 64 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 404bd2cce642..311aa04e7520 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -20,6 +20,8 @@ struct dsa_8021q_crosschip_link { refcount_t refcount; }; +#define DSA_8021Q_N_SUBVLAN 8 + #if IS_ENABLED(CONFIG_NET_DSA_TAG_8021Q) int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index, @@ -42,10 +44,14 @@ u16 dsa_8021q_tx_vid(struct dsa_switch *ds, int port); u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port); +u16 dsa_8021q_rx_vid_subvlan(struct dsa_switch *ds, int port, u16 subvlan); + int dsa_8021q_rx_switch_id(u16 vid); int dsa_8021q_rx_source_port(u16 vid); +u16 dsa_8021q_rx_subvlan(u16 vid); + bool vid_is_dsa_8021q(u16 vid); #else @@ -88,6 +94,11 @@ u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port) return 0; } +u16 dsa_8021q_rx_vid_subvlan(struct dsa_switch *ds, int port, u16 subvlan) +{ + return 0; +} + int dsa_8021q_rx_switch_id(u16 vid) { return 0; @@ -98,6 +109,11 @@ int dsa_8021q_rx_source_port(u16 vid) return 0; } +u16 dsa_8021q_rx_subvlan(u16 vid) +{ + return 0; +} + bool vid_is_dsa_8021q(u16 vid) { return false; diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 3236fbbf85b9..3052da668156 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -17,7 +17,7 @@ * * | 11 | 10 | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | * +-----------+-----+-----------------+-----------+-----------------------+ - * | DIR | RSV | SWITCH_ID | RSV | PORT | + * | DIR | SVL | SWITCH_ID | SUBVLAN | PORT | * +-----------+-----+-----------------+-----------+-----------------------+ * * DIR - VID[11:10]: @@ -27,17 +27,24 @@ * These values make the special VIDs of 0, 1 and 4095 to be left * unused by this coding scheme. * - * RSV - VID[9]: - * To be used for further expansion of SWITCH_ID or for other purposes. - * Must be transmitted as zero and ignored on receive. + * SVL/SUBVLAN - { VID[9], VID[5:4] }: + * Sub-VLAN encoding. Valid only when DIR indicates an RX VLAN. + * * 0 (0b000): Field does not encode a sub-VLAN, either because + * received traffic is untagged, PVID-tagged or because a second + * VLAN tag is present after this tag and not inside of it. + * * 1 (0b001): Received traffic is tagged with a VID value private + * to the host. This field encodes the index in the host's lookup + * table through which the value of the ingress VLAN ID can be + * recovered. + * * 2 (0b010): Field encodes a sub-VLAN. + * ... + * * 7 (0b111): Field encodes a sub-VLAN. 
+ * When DIR indicates a TX VLAN, SUBVLAN must be transmitted as zero + * (by the host) and ignored on receive (by the switch). * * SWITCH_ID - VID[8:6]: * Index of switch within DSA tree. Must be between 0 and 7. * - * RSV - VID[5:4]: - * To be used for further expansion of PORT or for other purposes. - * Must be transmitted as zero and ignored on receive. - * * PORT - VID[3:0]: * Index of switch port. Must be between 0 and 15. */ @@ -54,6 +61,18 @@ #define DSA_8021Q_SWITCH_ID(x) (((x) << DSA_8021Q_SWITCH_ID_SHIFT) & \ DSA_8021Q_SWITCH_ID_MASK) +#define DSA_8021Q_SUBVLAN_HI_SHIFT 9 +#define DSA_8021Q_SUBVLAN_HI_MASK GENMASK(9, 9) +#define DSA_8021Q_SUBVLAN_LO_SHIFT 4 +#define DSA_8021Q_SUBVLAN_LO_MASK GENMASK(4, 3) +#define DSA_8021Q_SUBVLAN_HI(x) (((x) & GENMASK(2, 2)) >> 2) +#define DSA_8021Q_SUBVLAN_LO(x) ((x) & GENMASK(1, 0)) +#define DSA_8021Q_SUBVLAN(x) \ + (((DSA_8021Q_SUBVLAN_LO(x) << DSA_8021Q_SUBVLAN_LO_SHIFT) & \ + DSA_8021Q_SUBVLAN_LO_MASK) | \ + ((DSA_8021Q_SUBVLAN_HI(x) << DSA_8021Q_SUBVLAN_HI_SHIFT) & \ + DSA_8021Q_SUBVLAN_HI_MASK)) + #define DSA_8021Q_PORT_SHIFT 0 #define DSA_8021Q_PORT_MASK GENMASK(3, 0) #define DSA_8021Q_PORT(x) (((x) << DSA_8021Q_PORT_SHIFT) & \ @@ -79,6 +98,13 @@ u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port) } EXPORT_SYMBOL_GPL(dsa_8021q_rx_vid); +u16 dsa_8021q_rx_vid_subvlan(struct dsa_switch *ds, int port, u16 subvlan) +{ + return DSA_8021Q_DIR_RX | DSA_8021Q_SWITCH_ID(ds->index) | + DSA_8021Q_PORT(port) | DSA_8021Q_SUBVLAN(subvlan); +} +EXPORT_SYMBOL_GPL(dsa_8021q_rx_vid_subvlan); + /* Returns the decoded switch ID from the RX VID. */ int dsa_8021q_rx_switch_id(u16 vid) { @@ -93,6 +119,20 @@ int dsa_8021q_rx_source_port(u16 vid) } EXPORT_SYMBOL_GPL(dsa_8021q_rx_source_port); +/* Returns the decoded subvlan from the RX VID. */ +u16 dsa_8021q_rx_subvlan(u16 vid) +{ + u16 svl_hi, svl_lo; + + svl_hi = (vid & DSA_8021Q_SUBVLAN_HI_MASK) >> + DSA_8021Q_SUBVLAN_HI_SHIFT; + svl_lo = (vid & DSA_8021Q_SUBVLAN_LO_MASK) >> + DSA_8021Q_SUBVLAN_LO_SHIFT; + + return (svl_hi << 2) | svl_lo; +} +EXPORT_SYMBOL_GPL(dsa_8021q_rx_subvlan); + bool vid_is_dsa_8021q(u16 vid) { return ((vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_RX || -- cgit v1.2.3 From 84eeb5d460e399795e9a92a0cd44999254886150 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 12 May 2020 20:20:34 +0300 Subject: net: dsa: tag_sja1105: implement sub-VLAN decoding Create a subvlan_map as part of each port's tagger private structure. This keeps reverse mappings of bridge-to-dsa_8021q VLAN retagging rules. Note that as of this patch, this piece of code is never engaged, due to the fact that the driver hasn't installed any retagging rule, so we'll always see packets with a subvlan code of 0 (untagged). Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/dsa/sja1105/sja1105_main.c | 4 ++++ include/linux/dsa/sja1105.h | 2 ++ net/dsa/tag_sja1105.c | 19 +++++++++++++++++++ 3 files changed, 25 insertions(+) (limited to 'net') diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index b7e4a85caade..fd15a18596ea 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -2856,6 +2856,7 @@ static int sja1105_probe(struct spi_device *spi) struct sja1105_port *sp = &priv->ports[port]; struct dsa_port *dp = dsa_to_port(ds, port); struct net_device *slave; + int subvlan; if (!dsa_is_user_port(ds, port)) continue; @@ -2876,6 +2877,9 @@ static int sja1105_probe(struct spi_device *spi) } skb_queue_head_init(&sp->xmit_queue); sp->xmit_tpid = ETH_P_SJA1105; + + for (subvlan = 0; subvlan < DSA_8021Q_N_SUBVLAN; subvlan++) + sp->subvlan_map[subvlan] = VLAN_N_VID; } return 0; diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index f821d08b1b5f..dd93735ae228 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -9,6 +9,7 @@ #include #include +#include #include #define ETH_P_SJA1105 ETH_P_DSA_8021Q @@ -53,6 +54,7 @@ struct sja1105_skb_cb { ((struct sja1105_skb_cb *)DSA_SKB_CB_PRIV(skb)) struct sja1105_port { + u16 subvlan_map[DSA_8021Q_N_SUBVLAN]; struct kthread_worker *xmit_worker; struct kthread_work xmit_work; struct sk_buff_head xmit_queue; diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 398e2b9a1b96..ad105550b145 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -254,6 +254,20 @@ static struct sk_buff return skb; } +static void sja1105_decode_subvlan(struct sk_buff *skb, u16 subvlan) +{ + struct dsa_port *dp = dsa_slave_to_port(skb->dev); + struct sja1105_port *sp = dp->priv; + u16 vid = sp->subvlan_map[subvlan]; + u16 vlan_tci; + + if (vid == VLAN_N_VID) + return; + + vlan_tci = (skb->priority << VLAN_PRIO_SHIFT) | vid; + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tci); +} + static struct sk_buff *sja1105_rcv(struct sk_buff *skb, struct net_device *netdev, struct packet_type *pt) @@ -263,6 +277,7 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, struct ethhdr *hdr; u16 tpid, vid, tci; bool is_link_local; + u16 subvlan = 0; bool is_tagged; bool is_meta; @@ -286,6 +301,7 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, source_port = dsa_8021q_rx_source_port(vid); switch_id = dsa_8021q_rx_switch_id(vid); skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + subvlan = dsa_8021q_rx_subvlan(vid); } else if (is_link_local) { /* Management traffic path. Switch embeds the switch ID and * port ID into bytes of the destination MAC, courtesy of @@ -310,6 +326,9 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, return NULL; } + if (subvlan) + sja1105_decode_subvlan(skb, subvlan); + return sja1105_rcv_meta_state_machine(skb, &meta, is_link_local, is_meta); } -- cgit v1.2.3 From 51fa960d3b5163b1af22efdebcabfccc5d615ad6 Mon Sep 17 00:00:00 2001 From: William Tu Date: Tue, 12 May 2020 10:36:23 -0700 Subject: erspan: Check IFLA_GRE_ERSPAN_VER is set. Add a check to make sure the IFLA_GRE_ERSPAN_VER is provided by users. Fixes: f989d546a2d5 ("erspan: Add type I version 0 support.") Cc: Eric Dumazet Signed-off-by: William Tu Reviewed-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/ip_gre.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index e29cd48674d7..0ce9b91ff55c 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1087,7 +1087,8 @@ static int erspan_validate(struct nlattr *tb[], struct nlattr *data[], if (ret) return ret; - if (nla_get_u8(data[IFLA_GRE_ERSPAN_VER]) == 0) + if (data[IFLA_GRE_ERSPAN_VER] && + nla_get_u8(data[IFLA_GRE_ERSPAN_VER]) == 0) return 0; /* ERSPAN type II/III should only have GRE sequence and key flag */ -- cgit v1.2.3 From fb9f2e92864f51d25e790947cca2ac4426a12f9c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 13 May 2020 03:23:27 +0300 Subject: net: dsa: tag_sja1105: appease sparse checks for ethertype accessors A comparison between a value from the packet and an integer constant value needs to be done by converting the value from the packet from net->host, or the constant from host->net. Not the other way around. Even though it makes no practical difference, correct that. Fixes: 38b5beeae7a4 ("net: dsa: sja1105: prepare tagger for handling DSA tags and VLAN simultaneously") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dsa/tag_sja1105.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index ad105550b145..9b4a4d719291 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -73,10 +73,10 @@ static bool sja1105_can_use_vlan_as_tags(const struct sk_buff *skb) { struct vlan_ethhdr *hdr = vlan_eth_hdr(skb); - if (hdr->h_vlan_proto == ntohs(ETH_P_SJA1105)) + if (hdr->h_vlan_proto == htons(ETH_P_SJA1105)) return true; - if (hdr->h_vlan_proto != ntohs(ETH_P_8021Q)) + if (hdr->h_vlan_proto != htons(ETH_P_8021Q)) return false; return vid_is_dsa_8021q(ntohs(hdr->h_vlan_TCI) & VLAN_VID_MASK); -- cgit v1.2.3 From aaebf8e6088270e45d30314031b5d9a88a589cb9 Mon Sep 17 00:00:00 2001 From: Abhishek Pandit-Subedi Date: Tue, 12 May 2020 19:09:32 -0700 Subject: Bluetooth: Fix incorrect type for window and interval The types for window and interval should be uint16, not uint8. Signed-off-by: Abhishek Pandit-Subedi Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_request.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 3f470f0e432c..f6870e98faab 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -890,7 +890,7 @@ void hci_req_add_le_passive_scan(struct hci_request *req) struct hci_dev *hdev = req->hdev; u8 own_addr_type; u8 filter_policy; - u8 window, interval; + u16 window, interval; if (hdev->scanning_paused) { bt_dev_dbg(hdev, "Scanning is paused for suspend"); -- cgit v1.2.3 From 91779665c129d361c032d2a590b37a077b6cb9d7 Mon Sep 17 00:00:00 2001 From: Abhishek Pandit-Subedi Date: Tue, 12 May 2020 19:09:33 -0700 Subject: Bluetooth: Modify LE window and interval for suspend When a device is suspended, it doesn't need to be as responsive to connection events. Increase the interval to 640ms (creating a duty cycle of roughly 1.75%) so that passive scanning uses much less power (vs previous duty cycle of 18.75%). The new window + interval combination has been tested to work with HID devices (which are currently the only devices capable of wake up). 
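To sanity-check the duty-cycle figures above: HCI LE scan window and interval values are expressed in units of 0.625 ms, so the numbers in this message can be reproduced with a small stand-alone sketch (plain userspace C, not part of the patch):

#include <stdio.h>

/* Stand-alone sketch (not kernel code): reproduce the duty cycles
 * quoted in the commit message. HCI LE scan parameters are in units
 * of 0.625 ms.
 */
int main(void)
{
	const double unit_ms = 0.625;
	const unsigned int window = 0x0012;		/* 18 units */
	const unsigned int old_interval = 0x0060;	/* 96 units */
	const unsigned int new_interval = 0x0400;	/* 1024 units */

	printf("window   = %6.2f ms\n", window * unit_ms);	/* 11.25 */
	printf("interval = %6.2f ms (old), %6.2f ms (new)\n",
	       old_interval * unit_ms,
	       new_interval * unit_ms);				/* 60, 640 */
	printf("duty     = %5.2f%% (old), %5.2f%% (new)\n",
	       100.0 * window / old_interval,			/* 18.75 */
	       100.0 * window / new_interval);			/* ~1.76 */
	return 0;
}

That is, 18/96 gives the previous 18.75% duty cycle and 18/1024 gives the new figure of roughly 1.75%.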
Signed-off-by: Abhishek Pandit-Subedi Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_request.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index f6870e98faab..6b45e31432a7 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -35,7 +35,7 @@ #define HCI_REQ_CANCELED 2 #define LE_SUSPEND_SCAN_WINDOW 0x0012 -#define LE_SUSPEND_SCAN_INTERVAL 0x0060 +#define LE_SUSPEND_SCAN_INTERVAL 0x0400 void hci_req_init(struct hci_request *req, struct hci_dev *hdev) { -- cgit v1.2.3 From 0d2c9825e46d45f8a520135c9c791b5c73a165ab Mon Sep 17 00:00:00 2001 From: Abhishek Pandit-Subedi Date: Tue, 12 May 2020 19:19:25 -0700 Subject: Bluetooth: Rename BT_SUSPEND_COMPLETE Renamed BT_SUSPEND_COMPLETE to BT_SUSPEND_CONFIGURE_WAKE since it sets up the event filter and whitelist for wake-up. Signed-off-by: Abhishek Pandit-Subedi Reviewed-by: Alain Michaud Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 2 +- net/bluetooth/hci_core.c | 2 +- net/bluetooth/hci_request.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 0c7f3ad76665..869ee2b30a4c 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -110,7 +110,7 @@ enum suspend_tasks { enum suspended_state { BT_RUNNING = 0, BT_SUSPEND_DISCONNECT, - BT_SUSPEND_COMPLETE, + BT_SUSPEND_CONFIGURE_WAKE, }; struct hci_conn_hash { diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 51d399273276..de1f4e72ec06 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3353,7 +3353,7 @@ static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action, /* Only configure whitelist if disconnect succeeded */ if (!ret) ret = hci_change_suspend_state(hdev, - BT_SUSPEND_COMPLETE); + BT_SUSPEND_CONFIGURE_WAKE); } else if (action == PM_POST_SUSPEND) { ret = hci_change_suspend_state(hdev, BT_RUNNING); } diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 6b45e31432a7..1fc55685da62 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -1090,7 +1090,7 @@ void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next) disconnect_counter); set_bit(SUSPEND_DISCONNECTING, hdev->suspend_tasks); } - } else if (next == BT_SUSPEND_COMPLETE) { + } else if (next == BT_SUSPEND_CONFIGURE_WAKE) { /* Unpause to take care of updating scanning params */ hdev->scanning_paused = false; /* Enable event filter for paired devices */ -- cgit v1.2.3 From 81dafad53c67abe4f09b0b04131fe490e76e5602 Mon Sep 17 00:00:00 2001 From: Abhishek Pandit-Subedi Date: Tue, 12 May 2020 19:19:26 -0700 Subject: Bluetooth: Add hook for driver to prevent wake from suspend Let drivers have a hook to disable configuring scanning during suspend. Drivers should use the device_may_wakeup function call to determine whether hci should be configured for wakeup. 
For example, an implementation for btusb may look like the following: bool btusb_prevent_wake(struct hci_dev *hdev) { struct btusb_data *data = hci_get_drvdata(hdev); return !device_may_wakeup(&data->udev->dev); } Signed-off-by: Abhishek Pandit-Subedi Reviewed-by: Alain Michaud Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 1 + net/bluetooth/hci_core.c | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 869ee2b30a4c..5dcf85f186c6 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -503,6 +503,7 @@ struct hci_dev { int (*set_diag)(struct hci_dev *hdev, bool enable); int (*set_bdaddr)(struct hci_dev *hdev, const bdaddr_t *bdaddr); void (*cmd_timeout)(struct hci_dev *hdev); + bool (*prevent_wake)(struct hci_dev *hdev); }; #define HCI_PHY_HANDLE(handle) (handle & 0xff) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index de1f4e72ec06..dbe2d79f233f 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3350,8 +3350,10 @@ static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action, */ ret = hci_change_suspend_state(hdev, BT_SUSPEND_DISCONNECT); - /* Only configure whitelist if disconnect succeeded */ - if (!ret) + /* Only configure whitelist if disconnect succeeded and wake + * isn't being prevented. + */ + if (!ret && !(hdev->prevent_wake && hdev->prevent_wake(hdev))) ret = hci_change_suspend_state(hdev, BT_SUSPEND_CONFIGURE_WAKE); } else if (action == PM_POST_SUSPEND) { -- cgit v1.2.3 From adf1d6926444029396861413aba8a0f2a805742a Mon Sep 17 00:00:00 2001 From: Sonny Sasaka Date: Wed, 6 May 2020 12:55:03 -0700 Subject: Bluetooth: Handle Inquiry Cancel error after Inquiry Complete After sending Inquiry Cancel command to the controller, it is possible that Inquiry Complete event comes before Inquiry Cancel command complete event. In this case the Inquiry Cancel command will have status of Command Disallowed since there is no Inquiry session to be cancelled. This case should not be treated as error, otherwise we can reach an inconsistent state. Example of a btmon trace when this happened: < HCI Command: Inquiry Cancel (0x01|0x0002) plen 0 > HCI Event: Inquiry Complete (0x01) plen 1 Status: Success (0x00) > HCI Event: Command Complete (0x0e) plen 4 Inquiry Cancel (0x01|0x0002) ncmd 1 Status: Command Disallowed (0x0c) Signed-off-by: Sonny Sasaka Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 006c24e04b44..73aabca0064b 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -42,12 +42,27 @@ /* Handle HCI Event packets */ -static void hci_cc_inquiry_cancel(struct hci_dev *hdev, struct sk_buff *skb) +static void hci_cc_inquiry_cancel(struct hci_dev *hdev, struct sk_buff *skb, + u8 *new_status) { __u8 status = *((__u8 *) skb->data); BT_DBG("%s status 0x%2.2x", hdev->name, status); + /* It is possible that we receive Inquiry Complete event right + * before we receive Inquiry Cancel Command Complete event, in + * which case the latter event should have status of Command + * Disallowed (0x0c). This should not be treated as error, since + * we actually achieve what Inquiry Cancel wants to achieve, + * which is to end the last Inquiry session. 
+ */ + if (status == 0x0c && !test_bit(HCI_INQUIRY, &hdev->flags)) { + bt_dev_warn(hdev, "Ignoring error of Inquiry Cancel command"); + status = 0x00; + } + + *new_status = status; + if (status) return; @@ -3233,7 +3248,7 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, switch (*opcode) { case HCI_OP_INQUIRY_CANCEL: - hci_cc_inquiry_cancel(hdev, skb); + hci_cc_inquiry_cancel(hdev, skb, status); break; case HCI_OP_PERIODIC_INQ: -- cgit v1.2.3 From 5b440676c15bbe1a40f2546ec92db83ed66d9e22 Mon Sep 17 00:00:00 2001 From: Archie Pusaka Date: Tue, 14 Apr 2020 16:08:40 +0800 Subject: Bluetooth: L2CAP: add support for waiting disconnection resp Whenever we disconnect a L2CAP connection, we would immediately report a disconnection event (EPOLLHUP) to the upper layer, without waiting for the response of the other device. This patch offers an option to wait until we receive a disconnection response before reporting disconnection event, by using the "how" parameter in l2cap_sock_shutdown(). Therefore, upper layer can opt to wait for disconnection response by shutdown(sock, SHUT_WR). This can be used to enforce proper disconnection order in HID, where the disconnection of the interrupt channel must be complete before attempting to disconnect the control channel. Signed-off-by: Archie Pusaka Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_sock.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 1cea42ee1e92..a995d2c51fa7 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1271,14 +1271,21 @@ static int l2cap_sock_shutdown(struct socket *sock, int how) struct l2cap_conn *conn; int err = 0; - BT_DBG("sock %p, sk %p", sock, sk); + BT_DBG("sock %p, sk %p, how %d", sock, sk, how); + + /* 'how' parameter is mapped to sk_shutdown as follows: + * SHUT_RD (0) --> RCV_SHUTDOWN (1) + * SHUT_WR (1) --> SEND_SHUTDOWN (2) + * SHUT_RDWR (2) --> SHUTDOWN_MASK (3) + */ + how++; if (!sk) return 0; lock_sock(sk); - if (sk->sk_shutdown) + if ((sk->sk_shutdown & how) == how) goto shutdown_already; BT_DBG("Handling sock shutdown"); @@ -1301,11 +1308,20 @@ static int l2cap_sock_shutdown(struct socket *sock, int how) * has already been actioned to close the L2CAP * link such as by l2cap_disconnection_req(). 
*/ - if (sk->sk_shutdown) - goto has_shutdown; + if ((sk->sk_shutdown & how) == how) + goto shutdown_matched; } - sk->sk_shutdown = SHUTDOWN_MASK; + /* Try setting the RCV_SHUTDOWN bit, return early if SEND_SHUTDOWN + * is already set + */ + if ((how & RCV_SHUTDOWN) && !(sk->sk_shutdown & RCV_SHUTDOWN)) { + sk->sk_shutdown |= RCV_SHUTDOWN; + if ((sk->sk_shutdown & how) == how) + goto shutdown_matched; + } + + sk->sk_shutdown |= SEND_SHUTDOWN; release_sock(sk); l2cap_chan_lock(chan); @@ -1335,7 +1351,7 @@ static int l2cap_sock_shutdown(struct socket *sock, int how) err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); -has_shutdown: +shutdown_matched: l2cap_chan_put(chan); sock_put(sk); @@ -1363,7 +1379,7 @@ static int l2cap_sock_release(struct socket *sock) bt_sock_unlink(&l2cap_sk_list, sk); - err = l2cap_sock_shutdown(sock, 2); + err = l2cap_sock_shutdown(sock, SHUT_RDWR); chan = l2cap_pi(sk)->chan; l2cap_chan_hold(chan); -- cgit v1.2.3 From 15172a46fa2796c1a1358a36babd31274716ed41 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:19 -0700 Subject: bpf: net: Refactor bpf_iter target registration Currently bpf_iter_reg_target takes parameters from target and allocates memory to save them. This is really not necessary, esp. in the future we may grow information passed from targets to bpf_iter manager. The patch refactors the code so target reg_info becomes static and bpf_iter manager can just take a reference to it. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200513180219.2949605-1-yhs@fb.com --- include/linux/bpf.h | 2 +- kernel/bpf/bpf_iter.c | 36 +++++++++++++++++------------------- kernel/bpf/map_iter.c | 18 +++++++++--------- kernel/bpf/task_iter.c | 30 ++++++++++++++++-------------- net/ipv6/route.c | 18 +++++++++--------- net/netlink/af_netlink.c | 18 +++++++++--------- 6 files changed, 61 insertions(+), 61 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ab94dfd8826f..6fa773e2d1bf 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1153,7 +1153,7 @@ struct bpf_iter_meta { u64 seq_num; }; -int bpf_iter_reg_target(struct bpf_iter_reg *reg_info); +int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const char *target); bool bpf_iter_prog_supported(struct bpf_prog *prog); int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 0a45a6cdfabd..051fb8cab62a 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -8,11 +8,7 @@ struct bpf_iter_target_info { struct list_head list; - const char *target; - const struct seq_operations *seq_ops; - bpf_iter_init_seq_priv_t init_seq_private; - bpf_iter_fini_seq_priv_t fini_seq_private; - u32 seq_priv_size; + const struct bpf_iter_reg *reg_info; u32 btf_id; /* cached value */ }; @@ -222,8 +218,8 @@ static int iter_release(struct inode *inode, struct file *file) iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); - if (iter_priv->tinfo->fini_seq_private) - iter_priv->tinfo->fini_seq_private(seq->private); + if (iter_priv->tinfo->reg_info->fini_seq_private) + iter_priv->tinfo->reg_info->fini_seq_private(seq->private); bpf_prog_put(iter_priv->prog); seq->private = iter_priv; @@ -238,7 +234,12 @@ const struct file_operations bpf_iter_fops = { .release = iter_release, }; -int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) +/* The argument reg_info will 
be cached in bpf_iter_target_info. + * The common practice is to declare target reg_info as + * a const static variable and passed as an argument to + * bpf_iter_reg_target(). + */ +int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; @@ -246,11 +247,7 @@ int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) if (!tinfo) return -ENOMEM; - tinfo->target = reg_info->target; - tinfo->seq_ops = reg_info->seq_ops; - tinfo->init_seq_private = reg_info->init_seq_private; - tinfo->fini_seq_private = reg_info->fini_seq_private; - tinfo->seq_priv_size = reg_info->seq_priv_size; + tinfo->reg_info = reg_info; INIT_LIST_HEAD(&tinfo->list); mutex_lock(&targets_mutex); @@ -267,7 +264,7 @@ void bpf_iter_unreg_target(const char *target) mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { - if (!strcmp(target, tinfo->target)) { + if (!strcmp(target, tinfo->reg_info->target)) { list_del(&tinfo->list); kfree(tinfo); found = true; @@ -303,7 +300,7 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) supported = true; break; } - if (!strcmp(attach_fname + prefix_len, tinfo->target)) { + if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) { cache_btf_id(tinfo, prog); supported = true; break; @@ -431,15 +428,16 @@ static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) tinfo = link->tinfo; total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + - tinfo->seq_priv_size; - priv_data = __seq_open_private(file, tinfo->seq_ops, total_priv_dsize); + tinfo->reg_info->seq_priv_size; + priv_data = __seq_open_private(file, tinfo->reg_info->seq_ops, + total_priv_dsize); if (!priv_data) { err = -ENOMEM; goto release_prog; } - if (tinfo->init_seq_private) { - err = tinfo->init_seq_private(priv_data->target_private); + if (tinfo->reg_info->init_seq_private) { + err = tinfo->reg_info->init_seq_private(priv_data->target_private); if (err) goto release_seq_file; } diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 8162e0c00b9f..c6216a5fe56e 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -81,17 +81,17 @@ static const struct seq_operations bpf_map_seq_ops = { .show = bpf_map_seq_show, }; +static const struct bpf_iter_reg bpf_map_reg_info = { + .target = "bpf_map", + .seq_ops = &bpf_map_seq_ops, + .init_seq_private = NULL, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), +}; + static int __init bpf_map_iter_init(void) { - struct bpf_iter_reg reg_info = { - .target = "bpf_map", - .seq_ops = &bpf_map_seq_ops, - .init_seq_private = NULL, - .fini_seq_private = NULL, - .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), - }; - - return bpf_iter_reg_target(®_info); + return bpf_iter_reg_target(&bpf_map_reg_info); } late_initcall(bpf_map_iter_init); diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index aeed662d8451..bd7bfd83d9e0 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -306,22 +306,24 @@ static const struct seq_operations task_file_seq_ops = { .show = task_file_seq_show, }; +static const struct bpf_iter_reg task_reg_info = { + .target = "task", + .seq_ops = &task_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), +}; + +static const struct bpf_iter_reg task_file_reg_info = { + .target = "task_file", + .seq_ops = &task_file_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = 
sizeof(struct bpf_iter_seq_task_file_info), +}; + static int __init task_iter_init(void) { - struct bpf_iter_reg task_file_reg_info = { - .target = "task_file", - .seq_ops = &task_file_seq_ops, - .init_seq_private = init_seq_pidns, - .fini_seq_private = fini_seq_pidns, - .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), - }; - struct bpf_iter_reg task_reg_info = { - .target = "task", - .seq_ops = &task_seq_ops, - .init_seq_private = init_seq_pidns, - .fini_seq_private = fini_seq_pidns, - .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), - }; int ret; ret = bpf_iter_reg_target(&task_reg_info); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 25f6d3e619d0..6ad2fa51a23a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6397,17 +6397,17 @@ void __init ip6_route_init_special_entries(void) #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt) +static const struct bpf_iter_reg ipv6_route_reg_info = { + .target = "ipv6_route", + .seq_ops = &ipv6_route_seq_ops, + .init_seq_private = bpf_iter_init_seq_net, + .fini_seq_private = bpf_iter_fini_seq_net, + .seq_priv_size = sizeof(struct ipv6_route_iter), +}; + static int __init bpf_iter_register(void) { - struct bpf_iter_reg reg_info = { - .target = "ipv6_route", - .seq_ops = &ipv6_route_seq_ops, - .init_seq_private = bpf_iter_init_seq_net, - .fini_seq_private = bpf_iter_fini_seq_net, - .seq_priv_size = sizeof(struct ipv6_route_iter), - }; - - return bpf_iter_reg_target(®_info); + return bpf_iter_reg_target(&ipv6_route_reg_info); } static void bpf_iter_unregister(void) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 33cda9baa979..839827227e98 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2803,17 +2803,17 @@ static const struct rhashtable_params netlink_rhashtable_params = { }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +static const struct bpf_iter_reg netlink_reg_info = { + .target = "netlink", + .seq_ops = &netlink_seq_ops, + .init_seq_private = bpf_iter_init_seq_net, + .fini_seq_private = bpf_iter_fini_seq_net, + .seq_priv_size = sizeof(struct nl_seq_iter), +}; + static int __init bpf_iter_register(void) { - struct bpf_iter_reg reg_info = { - .target = "netlink", - .seq_ops = &netlink_seq_ops, - .init_seq_private = bpf_iter_init_seq_net, - .fini_seq_private = bpf_iter_fini_seq_net, - .seq_priv_size = sizeof(struct nl_seq_iter), - }; - - return bpf_iter_reg_target(®_info); + return bpf_iter_reg_target(&netlink_reg_info); } #endif -- cgit v1.2.3 From ab2ee4fcb9d61fd57db70db694adbcf54662bd80 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:20 -0700 Subject: bpf: Change func bpf_iter_unreg_target() signature Change func bpf_iter_unreg_target() parameter from target name to target reg_info, similar to bpf_iter_reg_target(). 
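The resulting usage pattern is symmetric: a caller keeps one static reg_info and hands the same pointer to both calls. A minimal sketch with a hypothetical "foo" target (foo_seq_ops and struct foo_iter_priv are placeholders, not real kernel symbols; assumes the usual <linux/bpf.h> and <linux/init.h> headers):

/* Hypothetical target "foo": register and unregister with the same
 * static reg_info instead of looking the target up by name.
 */
static const struct bpf_iter_reg foo_reg_info = {
	.target		= "foo",
	.seq_ops	= &foo_seq_ops,			/* placeholder */
	.seq_priv_size	= sizeof(struct foo_iter_priv),	/* placeholder */
};

static int __init foo_iter_init(void)
{
	return bpf_iter_reg_target(&foo_reg_info);
}

static void __exit foo_iter_exit(void)
{
	/* After this patch: unregister by pointer, not by name. */
	bpf_iter_unreg_target(&foo_reg_info);
}

Unregistering by pointer also lets bpf_iter_unreg_target() drop the strcmp()-based lookup in favor of a plain pointer comparison, as the diff below shows.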
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200513180220.2949737-1-yhs@fb.com --- include/linux/bpf.h | 2 +- kernel/bpf/bpf_iter.c | 4 ++-- net/ipv6/route.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6fa773e2d1bf..534174eca86b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1154,7 +1154,7 @@ struct bpf_iter_meta { }; int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info); -void bpf_iter_unreg_target(const char *target); +void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info); bool bpf_iter_prog_supported(struct bpf_prog *prog); int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int bpf_iter_new_fd(struct bpf_link *link); diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 051fb8cab62a..644f8626b2c0 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -257,14 +257,14 @@ int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info) return 0; } -void bpf_iter_unreg_target(const char *target) +void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; bool found = false; mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { - if (!strcmp(target, tinfo->reg_info->target)) { + if (reg_info == tinfo->reg_info) { list_del(&tinfo->list); kfree(tinfo); found = true; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6ad2fa51a23a..22bf4e36c093 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6412,7 +6412,7 @@ static int __init bpf_iter_register(void) static void bpf_iter_unregister(void) { - bpf_iter_unreg_target("ipv6_route"); + bpf_iter_unreg_target(&ipv6_route_reg_info); } #endif #endif -- cgit v1.2.3 From 3c32cc1bceba8a1755dc35cd97516f6c67856844 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:21 -0700 Subject: bpf: Enable bpf_iter targets registering ctx argument types Commit b121b341e598 ("bpf: Add PTR_TO_BTF_ID_OR_NULL support") adds a field btf_id_or_null_non0_off to bpf_prog->aux structure to indicate that the first ctx argument is PTR_TO_BTF_ID reg_type and all others are PTR_TO_BTF_ID_OR_NULL. This approach does not really scale if we have other different reg types in the future, e.g., a pointer to a buffer. This patch enables bpf_iter targets registering ctx argument reg types which may be different from the default one. For example, for pointers to structures, the default reg_type is PTR_TO_BTF_ID for tracing program. The target can register a particular pointer type as PTR_TO_BTF_ID_OR_NULL which can be used by the verifier to enforce accesses. 
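A sketch of the registration this enables, in the spirit of the ipv6_route target (struct bpf_iter__ipv6_route is defined by the ip6_fib.h hunk below; treat the field values as illustrative rather than a verbatim quote of the hunk):

/* Illustrative sketch: a target declaring a per-argument reg_type.
 * Marking the "rt" ctx argument PTR_TO_BTF_ID_OR_NULL makes the
 * verifier force BPF programs to NULL-check it before dereferencing.
 */
static const struct bpf_iter_reg ipv6_route_reg_info = {
	.target			= "ipv6_route",
	.seq_ops		= &ipv6_route_seq_ops,
	.init_seq_private	= bpf_iter_init_seq_net,
	.fini_seq_private	= bpf_iter_fini_seq_net,
	.seq_priv_size		= sizeof(struct ipv6_route_iter),
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__ipv6_route, rt),
		  PTR_TO_BTF_ID_OR_NULL },
	},
};

Arguments without a registered entry keep the default tracing reg_type (PTR_TO_BTF_ID), which is what replaces the old all-or-nothing btf_id_or_null_non0_off flag.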
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200513180221.2949882-1-yhs@fb.com --- include/linux/bpf.h | 12 +++++++++++- include/net/ip6_fib.h | 7 +++++++ kernel/bpf/bpf_iter.c | 5 +++++ kernel/bpf/btf.c | 15 ++++++++++----- kernel/bpf/map_iter.c | 5 +++++ kernel/bpf/task_iter.c | 12 ++++++++++++ kernel/bpf/verifier.c | 1 - net/ipv6/ip6_fib.c | 5 ----- net/ipv6/route.c | 5 +++++ net/netlink/af_netlink.c | 5 +++++ 10 files changed, 60 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 534174eca86b..c45d198ac38c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -643,6 +643,12 @@ struct bpf_jit_poke_descriptor { u16 reason; }; +/* reg_type info for ctx arguments */ +struct bpf_ctx_arg_aux { + u32 offset; + enum bpf_reg_type reg_type; +}; + struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; @@ -654,12 +660,13 @@ struct bpf_prog_aux { u32 func_cnt; /* used by non-func prog as the number of func progs */ u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */ u32 attach_btf_id; /* in-kernel BTF type id to attach to */ + u32 ctx_arg_info_size; + const struct bpf_ctx_arg_aux *ctx_arg_info; struct bpf_prog *linked_prog; bool verifier_zext; /* Zero extensions has been inserted by verifier. */ bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ bool func_proto_unreliable; - bool btf_id_or_null_non0_off; enum bpf_tramp_prog_type trampoline_prog_type; struct bpf_trampoline *trampoline; struct hlist_node tramp_hlist; @@ -1139,12 +1146,15 @@ int bpf_obj_get_user(const char __user *pathname, int flags); typedef int (*bpf_iter_init_seq_priv_t)(void *private_data); typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); +#define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; const struct seq_operations *seq_ops; bpf_iter_init_seq_priv_t init_seq_private; bpf_iter_fini_seq_priv_t fini_seq_private; u32 seq_priv_size; + u32 ctx_arg_info_size; + struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; }; struct bpf_iter_meta { diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 80262d2980f5..870b646c5797 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -540,6 +540,13 @@ static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric) return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric)); } +#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) +struct bpf_iter__ipv6_route { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct fib6_info *, rt); +}; +#endif + #ifdef CONFIG_IPV6_MULTIPLE_TABLES static inline bool fib6_has_custom_rules(const struct net *net) { diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 644f8626b2c0..dd612b80b9fe 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -308,6 +308,11 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) } mutex_unlock(&targets_mutex); + if (supported) { + prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size; + prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info; + } + return supported; } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index dcd233139294..58c9af1d4808 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3694,7 +3694,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, struct bpf_verifier_log *log = info->log; const struct btf_param *args; u32 nr_args, arg; - int 
ret; + int i, ret; if (off % 8) { bpf_log(log, "func '%s' offset %d is not multiple of 8\n", @@ -3790,10 +3790,15 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return true; /* this is a pointer to another type */ - if (off != 0 && prog->aux->btf_id_or_null_non0_off) - info->reg_type = PTR_TO_BTF_ID_OR_NULL; - else - info->reg_type = PTR_TO_BTF_ID; + info->reg_type = PTR_TO_BTF_ID; + for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { + const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; + + if (ctx_arg_info->offset == off) { + info->reg_type = ctx_arg_info->reg_type; + break; + } + } if (tgt_prog) { ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg); diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index c6216a5fe56e..c69071e334bf 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -87,6 +87,11 @@ static const struct bpf_iter_reg bpf_map_reg_info = { .init_seq_private = NULL, .fini_seq_private = NULL, .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_map, map), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static int __init bpf_map_iter_init(void) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index bd7bfd83d9e0..a9b7264dda08 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -312,6 +312,11 @@ static const struct bpf_iter_reg task_reg_info = { .init_seq_private = init_seq_pidns, .fini_seq_private = fini_seq_pidns, .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__task, task), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static const struct bpf_iter_reg task_file_reg_info = { @@ -320,6 +325,13 @@ static const struct bpf_iter_reg task_file_reg_info = { .init_seq_private = init_seq_pidns, .fini_seq_private = fini_seq_pidns, .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__task_file, task), + PTR_TO_BTF_ID_OR_NULL }, + { offsetof(struct bpf_iter__task_file, file), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static int __init task_iter_init(void) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2a1826c76bb6..a3f2af756fd6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10652,7 +10652,6 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = t; if (!bpf_iter_prog_supported(prog)) return -EINVAL; - prog->aux->btf_id_or_null_non0_off = true; ret = btf_distill_func_proto(&env->log, btf, t, tname, &fmodel); return ret; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index a1fcc0ca21af..250ff52c674e 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2638,11 +2638,6 @@ static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v) } #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) -struct bpf_iter__ipv6_route { - __bpf_md_ptr(struct bpf_iter_meta *, meta); - __bpf_md_ptr(struct fib6_info *, rt); -}; - static int ipv6_route_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, void *v) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 22bf4e36c093..22e56465f14d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6403,6 +6403,11 @@ static const struct bpf_iter_reg ipv6_route_reg_info = { .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct ipv6_route_iter), + .ctx_arg_info_size = 1, 
+ .ctx_arg_info = { + { offsetof(struct bpf_iter__ipv6_route, rt), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static int __init bpf_iter_register(void) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 839827227e98..4f2c3b14ddbf 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2809,6 +2809,11 @@ static const struct bpf_iter_reg netlink_reg_info = { .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct nl_seq_iter), + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__netlink, sk), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static int __init bpf_iter_register(void) -- cgit v1.2.3 From 6e8a4f9dda3823274fa8a4c1aa5e6a93f9775749 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 May 2020 13:07:59 +0200 Subject: net: ignore sock_from_file errors in __scm_install_fd The code had historically been ignoring these errors, and my recent refactoring changed that, which broke ssh in some setups. Fixes: 2618d530dd8b ("net/scm: cleanup scm_detach_fds") Reported-by: Ido Schimmel Signed-off-by: Christoph Hellwig Tested-by: Ido Schimmel Tested-by: Ioana Ciornei Signed-off-by: David S. Miller --- net/core/scm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/scm.c b/net/core/scm.c index a75cd637a71f..875df1c2989d 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -307,7 +307,7 @@ static int __scm_install_fd(struct file *file, int __user *ufd, int o_flags) sock_update_classid(&sock->sk->sk_cgrp_data); } fd_install(new_fd, get_file(file)); - return error; + return 0; } static int scm_max_fds(struct msghdr *msg) -- cgit v1.2.3 From 1b2f08df0a886e0565c71821d5230cba395f5c18 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 May 2020 21:36:41 +0200 Subject: ipv6: set msg_control_is_user in do_ipv6_getsockopt While do_ipv6_getsockopt does not call the high-level recvmsg helper, the msghdr eventually ends up being passed to put_cmsg anyway, and thus needs msg_control_is_user set to the proper value. Fixes: 1f466e1f15cf ("net: cleanly handle kernel vs user buffers for ->msg_control") Reported-by: Eric Dumazet Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- net/ipv6/ipv6_sockglue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 18d05403d3b5..a0e50cc57e54 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -1075,6 +1075,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, msg.msg_control = optval; msg.msg_controllen = len; msg.msg_flags = flags; + msg.msg_control_is_user = true; lock_sock(sk); skb = np->pktoptions; -- cgit v1.2.3 From 5a46b062e28f57bffde767437fad3ab1d0cee2c7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 13 May 2020 10:28:22 -0700 Subject: devlink: refactor end checks in devlink_nl_cmd_region_read_dumpit Clean up after recent fixes, move address calculations around and change the variable init, so that we can have just one start_offset == end_offset check. Make the check a little stricter to preserve the -EINVAL error if requested start offset is larger than the region itself. Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- net/core/devlink.c | 41 +++++++++------------- .../selftests/drivers/net/netdevsim/devlink.sh | 15 ++++++++ 2 files changed, 31 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 20f935fa29f5..7b76e5fffc10 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4215,7 +4215,6 @@ static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb, struct nlattr **attrs, u64 start_offset, u64 end_offset, - bool dump, u64 *new_offset) { struct devlink_snapshot *snapshot; @@ -4230,9 +4229,6 @@ static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb, if (!snapshot) return -EINVAL; - if (end_offset > region->size || dump) - end_offset = region->size; - while (curr_offset < end_offset) { u32 data_size; u8 *data; @@ -4260,13 +4256,12 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); - u64 ret_offset, start_offset, end_offset = 0; + u64 ret_offset, start_offset, end_offset = U64_MAX; struct nlattr **attrs = info->attrs; struct devlink_region *region; struct nlattr *chunks_attr; const char *region_name; struct devlink *devlink; - bool dump = true; void *hdr; int err; @@ -4294,8 +4289,21 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto out_unlock; } + if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] && + attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) { + if (!start_offset) + start_offset = + nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); + + end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); + end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]); + } + + if (end_offset > region->size) + end_offset = region->size; + /* return 0 if there is no further data to read */ - if (start_offset >= region->size) { + if (start_offset == end_offset) { err = 0; goto out_unlock; } @@ -4322,27 +4330,10 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto nla_put_failure; } - if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] && - attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) { - if (!start_offset) - start_offset = - nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); - - end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); - end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]); - dump = false; - - if (start_offset == end_offset) { - err = 0; - goto nla_put_failure; - } - } - err = devlink_nl_region_read_snapshot_fill(skb, devlink, region, attrs, start_offset, - end_offset, dump, - &ret_offset); + end_offset, &ret_offset); if (err && err != -EMSGSIZE) goto nla_put_failure; diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh index ad539eccddcb..de4b32fc4223 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh @@ -146,6 +146,21 @@ regions_test() check_region_snapshot_count dummy post-first-request 3 + devlink region dump $DL_HANDLE/dummy snapshot 25 >> /dev/null + check_err $? "Failed to dump snapshot with id 25" + + devlink region read $DL_HANDLE/dummy snapshot 25 addr 0 len 1 >> /dev/null + check_err $? "Failed to read snapshot with id 25 (1 byte)" + + devlink region read $DL_HANDLE/dummy snapshot 25 addr 128 len 128 >> /dev/null + check_err $? "Failed to read snapshot with id 25 (128 bytes)" + + devlink region read $DL_HANDLE/dummy snapshot 25 addr 128 len $((1<<32)) >> /dev/null + check_err $? 
"Failed to read snapshot with id 25 (oversized)" + + devlink region read $DL_HANDLE/dummy snapshot 25 addr $((1<<32)) len 128 >> /dev/null 2>&1 + check_fail $? "Bad read of snapshot with id 25 did not fail" + devlink region del $DL_HANDLE/dummy snapshot 25 check_err $? "Failed to delete snapshot with id 25" -- cgit v1.2.3 From 7aebfa1b3885b5aa29fcb4a596d0485ac463bbe8 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 13 May 2020 18:50:27 -0700 Subject: bpf: Support narrow loads from bpf_sock_addr.user_port bpf_sock_addr.user_port supports only 4-byte load and it leads to ugly code in BPF programs, like: volatile __u32 user_port = ctx->user_port; __u16 port = bpf_ntohs(user_port); Since otherwise clang may optimize the load to be 2-byte and it's rejected by verifier. Add support for 1- and 2-byte loads same way as it's supported for other fields in bpf_sock_addr like user_ip4, msg_src_ip4, etc. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/c1e983f4c17573032601d0b2b1f9d1274f24bc16.1589420814.git.rdna@fb.com --- include/uapi/linux/bpf.h | 2 +- net/core/filter.c | 15 +++++++-------- tools/include/uapi/linux/bpf.h | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bfb31c1be219..85cfdffde182 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3728,7 +3728,7 @@ struct bpf_sock_addr { __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ - __u32 user_port; /* Allows 4-byte read and write. + __u32 user_port; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order */ __u32 family; /* Allows 4-byte read, but no write */ diff --git a/net/core/filter.c b/net/core/filter.c index da0634979f53..1fe8c0c2d408 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7029,6 +7029,7 @@ static bool sock_addr_is_valid_access(int off, int size, case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): + case bpf_ctx_range(struct bpf_sock_addr, user_port): if (type == BPF_READ) { bpf_ctx_record_field_size(info, size_default); @@ -7059,10 +7060,6 @@ static bool sock_addr_is_valid_access(int off, int size, return false; } break; - case bpf_ctx_range(struct bpf_sock_addr, user_port): - if (size != size_default) - return false; - break; case offsetof(struct bpf_sock_addr, sk): if (type != BPF_READ) return false; @@ -7958,8 +7955,8 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { + int off, port_size = sizeof_field(struct sockaddr_in6, sin6_port); struct bpf_insn *insn = insn_buf; - int off; switch (si->off) { case offsetof(struct bpf_sock_addr, user_family): @@ -7994,9 +7991,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, offsetof(struct sockaddr_in6, sin6_port)); BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) != sizeof_field(struct sockaddr_in6, sin6_port)); - SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern, - struct sockaddr_in6, uaddr, - sin6_port, tmp_reg); + /* Account for sin6_port being smaller than user_port. 
*/ + port_size = min(port_size, BPF_LDST_BYTES(si)); + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, + sin6_port, bytes_to_bpf_size(port_size), 0, tmp_reg); break; case offsetof(struct bpf_sock_addr, family): diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bfb31c1be219..85cfdffde182 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3728,7 +3728,7 @@ struct bpf_sock_addr { __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ - __u32 user_port; /* Allows 4-byte read and write. + __u32 user_port; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order */ __u32 family; /* Allows 4-byte read, but no write */ -- cgit v1.2.3 From d56c2f95adb3d401bf982b6cf8fc4bb6d2f7acdd Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 14 May 2020 13:03:45 -0700 Subject: bpf: Allow sk lookup helpers in cgroup skb Currently sk lookup helpers are allowed in tc, xdp, sk skb, and cgroup sock_addr programs. But they would be useful in cgroup skb as well, so that, for example, a cgroup skb ingress program can look up the peer socket a packet comes from on the same host and decide whether to allow or deny that packet based on the properties of the socket, e.g. the cgroup that the peer socket belongs to. Allow the following sk lookup helpers in cgroup skb: * bpf_sk_lookup_tcp; * bpf_sk_lookup_udp; * bpf_sk_release; * bpf_skc_lookup_tcp. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/f8c7ee280f1582b586629436d777b6db00597d63.1589486450.git.rdna@fb.com --- net/core/filter.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 1fe8c0c2d408..9c3eada5c86c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6159,6 +6159,14 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_cgroup_id_proto; #endif #ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; + case BPF_FUNC_skc_lookup_tcp: + return &bpf_skc_lookup_tcp_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; case BPF_FUNC_get_listener_sock: -- cgit v1.2.3 From 06d3e4c9f11afc849dc201ecf9ef7a43eeb1dddd Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 14 May 2020 13:03:46 -0700 Subject: bpf: Allow skb_ancestor_cgroup_id helper in cgroup skb cgroup skb programs can already use bpf_skb_cgroup_id. Allow bpf_skb_ancestor_cgroup_id as well, so that container policies can be implemented for a container that can have sub-cgroups dynamically created, while the policies are still based on the cgroup id of the container itself, not on the id of a sub-cgroup. 
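A sketch of the intended use (hypothetical program; CONTAINER_CGROUP_ID and the ancestor level stand in for values an orchestrator would provide):

	SEC("cgroup_skb/ingress")
	int ingress(struct __sk_buff *skb)
	{
		/* Compare against the container's cgroup two levels below
		 * the root, regardless of which dynamically created
		 * sub-cgroup the socket actually sits in.
		 */
		if (bpf_skb_ancestor_cgroup_id(skb, 2) == CONTAINER_CGROUP_ID)
			return 1;	/* allow */
		return 0;		/* deny */
	}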
Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/8874194d6041eba190356453ea9f6071edf5f658.1589486450.git.rdna@fb.com --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 9c3eada5c86c..a47dc5b9dad4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6157,6 +6157,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_cgroup_id: return &bpf_skb_cgroup_id_proto; + case BPF_FUNC_skb_ancestor_cgroup_id: + return &bpf_skb_ancestor_cgroup_id_proto; #endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: -- cgit v1.2.3 From f307fa2cb4c935f7f1ff0aeb880c7b44fb9a642b Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 14 May 2020 13:03:47 -0700 Subject: bpf: Introduce bpf_sk_{,ancestor_}cgroup_id helpers With the ability to look up sockets in cgroup skb programs, it becomes useful to access the cgroup id of retrieved sockets so that policies can be implemented based on the origin cgroup of such a socket. For example, a container running in a cgroup can have a cgroup skb ingress program that looks up the peer socket that is sending packets to a process inside the container and decides whether those packets should be allowed or denied based on the cgroup id of the peer. More specifically, such an ingress program can implement the intra-host policy "allow incoming packets only from this same container and not from any other container on the same host" w/o relying on source IP addresses, since quite often containers share the same IP address on the host. Introduce two new helpers for this use-case: bpf_sk_cgroup_id() and bpf_sk_ancestor_cgroup_id(). These helpers are similar to the existing bpf_skb_{,ancestor_}cgroup_id helpers with the only difference that sk is used to get the cgroup id instead of skb, and they share code with them. See documentation in UAPI for more details. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/f5884981249ce911f63e9b57ecd5d7d19154ff39.1589486450.git.rdna@fb.com --- include/uapi/linux/bpf.h | 36 ++++++++++++++++++++++++- net/core/filter.c | 60 +++++++++++++++++++++++++++++++++++------- tools/include/uapi/linux/bpf.h | 36 ++++++++++++++++++++++++- 3 files changed, 121 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 85cfdffde182..146c742f1d49 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3121,6 +3121,38 @@ union bpf_attr { * 0 on success, or a negative error in case of failure: * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + * + * u64 bpf_sk_cgroup_id(struct bpf_sock *sk) + * Description + * Return the cgroup v2 id of the socket *sk*. + * + * *sk* must be a non-**NULL** pointer to a full socket, e.g. one + * returned from **bpf_sk_lookup_xxx**\ (), + * **bpf_sk_fullsock**\ (), etc. The format of returned id is + * same as in **bpf_skb_cgroup_id**\ (). + * + * This helper is available only if the kernel was compiled with + * the **CONFIG_SOCK_CGROUP_DATA** configuration option. + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *sk* at the *ancestor_level*. 
The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *sk*, then return value will be same as that + * of **bpf_sk_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *sk*. + * + * The format of returned id and helper limitations are same as in + * **bpf_sk_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3250,7 +3282,9 @@ union bpf_attr { FN(sk_assign), \ FN(ktime_get_boot_ns), \ FN(seq_printf), \ - FN(seq_write), + FN(seq_write), \ + FN(sk_cgroup_id), \ + FN(sk_ancestor_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index a47dc5b9dad4..5815902bb617 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4003,16 +4003,22 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { }; #ifdef CONFIG_SOCK_CGROUP_DATA +static inline u64 __bpf_sk_cgroup_id(struct sock *sk) +{ + struct cgroup *cgrp; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + return cgroup_id(cgrp); +} + BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) { struct sock *sk = skb_to_full_sk(skb); - struct cgroup *cgrp; if (!sk || !sk_fullsock(sk)) return 0; - cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return cgroup_id(cgrp); + return __bpf_sk_cgroup_id(sk); } static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { @@ -4022,16 +4028,12 @@ static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, - ancestor_level) +static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk, + int ancestor_level) { - struct sock *sk = skb_to_full_sk(skb); struct cgroup *ancestor; struct cgroup *cgrp; - if (!sk || !sk_fullsock(sk)) - return 0; - cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); ancestor = cgroup_ancestor(cgrp, ancestor_level); if (!ancestor) @@ -4040,6 +4042,17 @@ BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, return cgroup_id(ancestor); } +BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, + ancestor_level) +{ + struct sock *sk = skb_to_full_sk(skb); + + if (!sk || !sk_fullsock(sk)) + return 0; + + return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level); +} + static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { .func = bpf_skb_ancestor_cgroup_id, .gpl_only = false, @@ -4047,6 +4060,31 @@ static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; + +BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk) +{ + return __bpf_sk_cgroup_id(sk); +} + +static const struct bpf_func_proto bpf_sk_cgroup_id_proto = { + .func = bpf_sk_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCKET, +}; + +BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level) +{ + return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level); +} + +static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = { + .func = bpf_sk_ancestor_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCKET, + .arg2_type = ARG_ANYTHING, +}; #endif static unsigned long bpf_xdp_copy(void 
*dst_buff, const void *src_buff, @@ -6159,6 +6197,10 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_cgroup_id_proto; case BPF_FUNC_skb_ancestor_cgroup_id: return &bpf_skb_ancestor_cgroup_id_proto; + case BPF_FUNC_sk_cgroup_id: + return &bpf_sk_cgroup_id_proto; + case BPF_FUNC_sk_ancestor_cgroup_id: + return &bpf_sk_ancestor_cgroup_id_proto; #endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 85cfdffde182..146c742f1d49 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3121,6 +3121,38 @@ union bpf_attr { * 0 on success, or a negative error in case of failure: * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + * + * u64 bpf_sk_cgroup_id(struct bpf_sock *sk) + * Description + * Return the cgroup v2 id of the socket *sk*. + * + * *sk* must be a non-**NULL** pointer to a full socket, e.g. one + * returned from **bpf_sk_lookup_xxx**\ (), + * **bpf_sk_fullsock**\ (), etc. The format of returned id is + * same as in **bpf_skb_cgroup_id**\ (). + * + * This helper is available only if the kernel was compiled with + * the **CONFIG_SOCK_CGROUP_DATA** configuration option. + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *sk* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *sk*, then return value will be same as that + * of **bpf_sk_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *sk*. + * + * The format of returned id and helper limitations are same as in + * **bpf_sk_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3250,7 +3282,9 @@ union bpf_attr { FN(sk_assign), \ FN(ktime_get_boot_ns), \ FN(seq_printf), \ - FN(seq_write), + FN(seq_write), \ + FN(sk_cgroup_id), \ + FN(sk_ancestor_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From a075767bbdc659066b89be282c8377fa880e9dc4 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:49:28 +0200 Subject: net: XDP-generic determining XDP frame size The SKB "head" pointer points to the data area that contains skb_shared_info, which can be found via skb_end_pointer(). Given that xdp->data_hard_start has been established (basically pointing to skb->head), the frame size is the span from data_hard_start to skb_end_pointer(), plus the size reserved for skb_shared_info. Change the bpf_xdp_adjust_tail offset adjustment of skb->len to be a positive number on grow and a negative number on shrink, as this seems more natural when reading the code. 
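For comparison, a page-per-packet driver does the same bookkeeping with a constant; a sketch (hypothetical driver RX path, names assumed):

	struct xdp_buff xdp;

	/* The whole page backs the frame, so frame_sz is simply
	 * PAGE_SIZE; headroom and the skb_shared_info tailroom both
	 * live inside that span.
	 */
	xdp.data_hard_start = page_address(page);
	xdp.data = xdp.data_hard_start + XDP_PACKET_HEADROOM;
	xdp.data_meta = xdp.data;
	xdp.data_end = xdp.data + pkt_len;
	xdp.frame_sz = PAGE_SIZE;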
Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/158945336804.97035.7164852191163722056.stgit@firesoul --- net/core/dev.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 4c91de39890a..f937a3ff668d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4617,6 +4617,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, xdp->data_meta = xdp->data; xdp->data_end = xdp->data + hlen; xdp->data_hard_start = skb->data - skb_headroom(skb); + + /* SKB "head" area always have tailroom for skb_shared_info */ + xdp->frame_sz = (void *)skb_end_pointer(skb) - xdp->data_hard_start; + xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + orig_data_end = xdp->data_end; orig_data = xdp->data; eth = (struct ethhdr *)xdp->data; @@ -4640,14 +4645,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, skb_reset_network_header(skb); } - /* check if bpf_xdp_adjust_tail was used. it can only "shrink" - * pckt. - */ - off = orig_data_end - xdp->data_end; + /* check if bpf_xdp_adjust_tail was used */ + off = xdp->data_end - orig_data_end; if (off != 0) { skb_set_tail_pointer(skb, xdp->data_end - xdp->data); - skb->len -= off; - + skb->len += off; /* positive on grow, negative on shrink */ } /* check if XDP changed eth hdr such SKB needs update */ -- cgit v1.2.3 From 34cc0b338a61de3eee3a2bfcaf4f9d6e9fae091a Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:49:33 +0200 Subject: xdp: Xdp_frame add member frame_sz and handle in convert_to_xdp_frame Use a hole in struct xdp_frame when adding member frame_sz, which keeps the sizeof the struct the same (32 bytes). Drivers ixgbe and sfc had bug cases where the necessary/expected tailroom was not reserved. This can lead to some hard-to-catch memory corruption issues. With the driver's frame_sz available, this can be detected when the packet end via xdp->data_end exceeds the xdp_data_hard_end pointer, which accounts for the reserved tailroom. When this driver issue is detected, simply fail the conversion with NULL, which results in feedback to the driver (a failing xdp_do_redirect()) causing the driver to drop the packet. Given the lack of consistent XDP stats, this can be hard to troubleshoot. And given this is a driver bug, we want to generate some more noise in the form of a WARN stack dump (to identify the driver code that inlined convert_to_xdp_frame). Inlining the WARN macro is problematic, because it adds an asm instruction (ud2 on Intel CPUs) which influences instruction cache prefetching. Thus, introduce xdp_warn and the XDP_WARN macro to avoid this and at the same time make identifying the function and line of this inlined function easier. 
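For reference, a sketch of the relation the new tailroom check relies on (the real helper lives in include/net/xdp.h):

	/* Sketch: the usable packet area ends where the tailroom
	 * reserved for skb_shared_info begins.
	 */
	#define xdp_data_hard_end(xdp)				\
		((xdp)->data_hard_start + (xdp)->frame_sz -	\
		 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))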
Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/158945337313.97035.10015729316710496600.stgit@firesoul --- include/net/xdp.h | 14 +++++++++++++- net/core/xdp.c | 8 ++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/xdp.h b/include/net/xdp.h index a764af4ae0ea..3094fccf5a88 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -89,7 +89,8 @@ struct xdp_frame { void *data; u16 len; u16 headroom; - u16 metasize; + u32 metasize:8; + u32 frame_sz:24; /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, * while mem info is valid on remote CPU. */ @@ -104,6 +105,10 @@ static inline void xdp_scrub_frame(struct xdp_frame *frame) frame->dev_rx = NULL; } +/* Avoids inlining WARN macro in fast-path */ +void xdp_warn(const char *msg, const char *func, const int line); +#define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__) + struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); /* Convert xdp_buff to xdp_frame */ @@ -124,6 +129,12 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) if (unlikely((headroom - metasize) < sizeof(*xdp_frame))) return NULL; + /* Catch if driver didn't reserve tailroom for skb_shared_info */ + if (unlikely(xdp->data_end > xdp_data_hard_end(xdp))) { + XDP_WARN("Driver BUG: missing reserved tailroom"); + return NULL; + } + /* Store info in top of packet */ xdp_frame = xdp->data_hard_start; @@ -131,6 +142,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) xdp_frame->len = xdp->data_end - xdp->data; xdp_frame->headroom = headroom - sizeof(*xdp_frame); xdp_frame->metasize = metasize; + xdp_frame->frame_sz = xdp->frame_sz; /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ xdp_frame->mem = xdp->rxq->mem; diff --git a/net/core/xdp.c b/net/core/xdp.c index 4c7ea85486af..490b8f5fa8ee 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -496,3 +497,10 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) return xdpf; } EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame); + +/* Used by XDP_WARN macro, to avoid inlining WARN() in fast-path */ +void xdp_warn(const char *msg, const char *func, const int line) +{ + WARN(1, "XDP_WARN: %s(line:%d): %s\n", func, line, msg); +}; +EXPORT_SYMBOL_GPL(xdp_warn); -- cgit v1.2.3 From c8741e2bfe872425ea6f10bb6f7dc1d67bc60c3a Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:51:25 +0200 Subject: xdp: Allow bpf_xdp_adjust_tail() to grow packet size Finally, after all drivers have a frame size, allow the BPF helper bpf_xdp_adjust_tail() to grow or extend the packet size at the frame tail. Remember that the helper/macro xdp_data_hard_end has reserved some tailroom. Thus, this helper makes sure that the BPF program doesn't get access to this tailroom area. 
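A minimal sketch of a program the helper now accepts; growing past the reserved tailroom still fails with -EINVAL:

	SEC("xdp")
	int xdp_grow_tail(struct xdp_md *ctx)
	{
		/* Positive delta grows the tail; negative still shrinks. */
		if (bpf_xdp_adjust_tail(ctx, 4) < 0)
			return XDP_ABORTED;
		return XDP_PASS;
	}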
V2: Remove one chicken check and use WARN_ONCE for the other. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/158945348530.97035.12577148209134239291.stgit@firesoul --- include/uapi/linux/bpf.h | 4 ++-- net/core/filter.c | 11 +++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 32cbf36c7729..b9b8a0f63b91 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2015,8 +2015,8 @@ union bpf_attr { * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is - * only possible to shrink the packet as of this writing, - * therefore *delta* must be a negative integer. + * possible to both shrink and grow the packet tail. + * Shrink done via *delta* being a negative integer. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers diff --git a/net/core/filter.c b/net/core/filter.c index 5815902bb617..e7b033dad44e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3411,12 +3411,19 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) { + void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */ void *data_end = xdp->data_end + offset; - /* only shrinking is allowed for now. */ - if (unlikely(offset >= 0)) + /* Notice that xdp_data_hard_end have reserved some tailroom */ + if (unlikely(data_end > data_hard_end)) return -EINVAL; + /* ALL drivers MUST init xdp->frame_sz, chicken check below */ + if (unlikely(xdp->frame_sz > PAGE_SIZE)) { + WARN_ONCE(1, "Too BIG xdp->frame_sz = %d\n", xdp->frame_sz); + return -EINVAL; + } + if (unlikely(data_end < xdp->data + ETH_HLEN)) return -EINVAL; -- cgit v1.2.3 From ddb47d518ca10948d1f64a983cb9274720f691cd Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:51:30 +0200 Subject: xdp: Clear grow memory in bpf_xdp_adjust_tail() Clear the tail memory when a grow happens, because it is too easy to write an XDP_PASS program that extends the tail, which exposes this memory to users that can run tcpdump. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/158945349039.97035.5262100484553494.stgit@firesoul --- net/core/filter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index e7b033dad44e..a85eb538d4d6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3427,6 +3427,10 @@ BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) if (unlikely(data_end < xdp->data + ETH_HLEN)) return -EINVAL; + /* Clear memory area on grow, can contain uninit kernel memory */ + if (offset > 0) + memset(xdp->data_end, 0, offset); + xdp->data_end = data_end; return 0; -- cgit v1.2.3 From bc56c919fce782f616823b76fb70a788f4762cf5 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:51:35 +0200 Subject: bpf: Add xdp.frame_sz in bpf_prog_test_run_xdp(). Update the memory requirements when adding xdp.frame_sz in the BPF test_run function bpf_prog_test_run_xdp(), which e.g. is used by XDP selftests. 
Specifically add the expected reserved tailroom, but also allocate a larger memory area to reflect that XDP frames usually come in this format. Limit the provided packet data size to 4096 minus headroom + tailroom, as this also reflects a common 3520-byte MTU limit with XDP. Note that bpf_test_init already uses a memory allocation method that clears memory. Thus, this already guards against leaking uninit kernel memory. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/158945349549.97035.15316291762482444006.stgit@firesoul --- net/bpf/test_run.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 29dbdd4c29f6..30ba7d38941d 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -470,25 +470,34 @@ out: int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { + u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + u32 headroom = XDP_PACKET_HEADROOM; u32 size = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; struct netdev_rx_queue *rxqueue; struct xdp_buff xdp = {}; u32 retval, duration; + u32 max_data_sz; void *data; int ret; if (kattr->test.ctx_in || kattr->test.ctx_out) return -EINVAL; - data = bpf_test_init(kattr, size, XDP_PACKET_HEADROOM + NET_IP_ALIGN, 0); + /* XDP have extra tailroom as (most) drivers use full page */ + max_data_sz = 4096 - headroom - tailroom; + if (size > max_data_sz) + return -EINVAL; + + data = bpf_test_init(kattr, max_data_sz, headroom, tailroom); if (IS_ERR(data)) return PTR_ERR(data); xdp.data_hard_start = data; - xdp.data = data + XDP_PACKET_HEADROOM + NET_IP_ALIGN; + xdp.data = data + headroom; xdp.data_meta = xdp.data; xdp.data_end = xdp.data + size; + xdp.frame_sz = headroom + max_data_sz + tailroom; rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); xdp.rxq = &rxqueue->xdp_rxq; @@ -496,8 +505,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); if (ret) goto out; - if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN || - xdp.data_end != xdp.data + size) + if (xdp.data != data + headroom || xdp.data_end != xdp.data + size) size = xdp.data_end - xdp.data; ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration); out: -- cgit v1.2.3 From 2c78ee898d8f10ae6fb2fa23a3fbaec96b1b7366 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 13 May 2020 16:03:54 -0700 Subject: bpf: Implement CAP_BPF Implement permissions as stated in uapi/linux/capability.h In order to do that the verifier allow_ptr_leaks flag is split into four flags, and they are set as: env->allow_ptr_leaks = bpf_allow_ptr_leaks(); env->bypass_spec_v1 = bpf_bypass_spec_v1(); env->bypass_spec_v4 = bpf_bypass_spec_v4(); env->bpf_capable = bpf_capable(); The first three are currently equivalent to perfmon_capable(), since leaking kernel pointers and reading kernel memory via side channel attacks is roughly equivalent to reading kernel memory with cap_perfmon. 'bpf_capable' enables bounded loops, precision tracking, bpf to bpf calls and other verifier features. 'allow_ptr_leaks' enables ptr leaks, ptr conversions, subtraction of pointers. 'bypass_spec_v1' disables speculative analysis in the verifier, run time mitigations in bpf array, and enables indirect variable access in bpf programs. 
'bypass_spec_v4' disables emission of sanitation code by the verifier. That means that a networking BPF program loaded with CAP_BPF + CAP_NET_ADMIN will have speculative checks done by the verifier and other spectre mitigations applied. Such a networking BPF program will not be able to leak kernel pointers and will not be able to access arbitrary kernel memory. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200513230355.7858-3-alexei.starovoitov@gmail.com --- drivers/media/rc/bpf-lirc.c | 2 +- include/linux/bpf.h | 18 ++++++++- include/linux/bpf_verifier.h | 3 ++ kernel/bpf/arraymap.c | 10 ++--- kernel/bpf/bpf_struct_ops.c | 2 +- kernel/bpf/core.c | 2 +- kernel/bpf/cpumap.c | 2 +- kernel/bpf/hashtab.c | 4 +- kernel/bpf/helpers.c | 4 +- kernel/bpf/lpm_trie.c | 2 +- kernel/bpf/map_in_map.c | 2 +- kernel/bpf/queue_stack_maps.c | 2 +- kernel/bpf/reuseport_array.c | 2 +- kernel/bpf/stackmap.c | 2 +- kernel/bpf/syscall.c | 89 +++++++++++++++++++++++++++++++++---------- kernel/bpf/verifier.c | 37 +++++++++--------- kernel/trace/bpf_trace.c | 3 ++ net/core/bpf_sk_storage.c | 4 +- net/core/filter.c | 4 +- 19 files changed, 134 insertions(+), 60 deletions(-) (limited to 'net') diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index 069c42f22a8c..5bb144435c16 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -110,7 +110,7 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_trace_printk: - if (capable(CAP_SYS_ADMIN)) + if (perfmon_capable()) return bpf_get_trace_printk_proto(); /* fall through */ default: diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c45d198ac38c..efe8836b5c48 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -19,6 +19,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -119,7 +120,7 @@ struct bpf_map { struct bpf_map_memory memory; char name[BPF_OBJ_NAME_LEN]; u32 btf_vmlinux_value_type_id; - bool unpriv_array; + bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ /* 22 bytes hole */ @@ -1095,6 +1096,21 @@ struct bpf_map *bpf_map_get_curr_or_next(u32 *id); extern int sysctl_unprivileged_bpf_disabled; +static inline bool bpf_allow_ptr_leaks(void) +{ + return perfmon_capable(); +} + +static inline bool bpf_bypass_spec_v1(void) +{ + return perfmon_capable(); +} + +static inline bool bpf_bypass_spec_v4(void) +{ + return perfmon_capable(); +} + int bpf_map_new_fd(struct bpf_map *map, int flags); int bpf_prog_new_fd(struct bpf_prog *prog); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6abd5a778fcd..ea833087e853 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -375,6 +375,9 @@ struct bpf_verifier_env { u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ bool allow_ptr_leaks; + bool bpf_capable; + bool bypass_spec_v1; + bool bypass_spec_v4; bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ const struct bpf_line_info *prev_linfo; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 95d77770353c..1d5bb0d983b2 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -77,7 +77,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int ret, numa_node = 
bpf_map_attr_numa_node(attr); u32 elem_size, index_mask, max_entries; - bool unpriv = !capable(CAP_SYS_ADMIN); + bool bypass_spec_v1 = bpf_bypass_spec_v1(); u64 cost, array_size, mask64; struct bpf_map_memory mem; struct bpf_array *array; @@ -95,7 +95,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) mask64 -= 1; index_mask = mask64; - if (unpriv) { + if (!bypass_spec_v1) { /* round up array size to nearest power of 2, * since cpu will speculate within index_mask limits */ @@ -149,7 +149,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); } array->index_mask = index_mask; - array->map.unpriv_array = unpriv; + array->map.bypass_spec_v1 = bypass_spec_v1; /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); @@ -219,7 +219,7 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - if (map->unpriv_array) { + if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { @@ -1053,7 +1053,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - if (map->unpriv_array) { + if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 26cb51f2db72..c6b0decaa46a 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -557,7 +557,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) struct bpf_map *map; int err; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6aa11de67315..c40ff4cf9880 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -646,7 +646,7 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) void bpf_prog_kallsyms_add(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp) || - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) return; bpf_prog_ksym_set_addr(fp); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index a71790dab12d..8b85bfddfac7 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -85,7 +85,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) u64 cost; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); /* check sanity of attributes */ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d541c8486c95..b4b288a3c3c9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -359,9 +359,9 @@ static int htab_map_alloc_check(union bpf_attr *attr) BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != offsetof(struct htab_elem, hash_node.pprev)); - if (lru && !capable(CAP_SYS_ADMIN)) + if (lru && !bpf_capable()) /* LRU implementation is much complicated than other - * maps. Hence, limit to CAP_SYS_ADMIN for now. + * maps. Hence, limit to CAP_BPF. 
*/ return -EPERM; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5c0290e0696e..886949fdcece 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -633,7 +633,7 @@ bpf_base_func_proto(enum bpf_func_id func_id) break; } - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return NULL; switch (func_id) { @@ -642,6 +642,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; case BPF_FUNC_trace_printk: + if (!perfmon_capable()) + return NULL; return bpf_get_trace_printk_proto(); case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 65c236cf341e..c8cc4e4cf98d 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -543,7 +543,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) u64 cost = sizeof(*trie), cost_per_node; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); /* check sanity of attributes */ diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index b3c48d1533cb..17738c93bec8 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -60,7 +60,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) /* Misc members not needed in bpf_map_meta_equal() check. */ inner_map_meta->ops = inner_map->ops; if (inner_map->ops == &array_map_ops) { - inner_map_meta->unpriv_array = inner_map->unpriv_array; + inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1; container_of(inner_map_meta, struct bpf_array, map)->index_mask = container_of(inner_map, struct bpf_array, map)->index_mask; } diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 30e1373fd437..05c8e043b9d2 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -45,7 +45,7 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) /* Called from syscall */ static int queue_stack_map_alloc_check(union bpf_attr *attr) { - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; /* check sanity of attributes */ diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 01badd3eda7a..21cde24386db 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -154,7 +154,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) struct bpf_map_memory mem; u64 array_size; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); array_size = sizeof(*array); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index db76339fe358..7b8381ce40a0 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -93,7 +93,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) u64 cost, n_buckets; int err; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); if (attr->map_flags & ~STACK_CREATE_FLAG_MASK) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index de2a75500233..79bcd8d056d2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1534,7 +1534,7 @@ static int map_freeze(const union bpf_attr *attr) err = -EBUSY; goto err_put; } - if (!capable(CAP_SYS_ADMIN)) { + if (!bpf_capable()) { err = -EPERM; goto err_put; } @@ -2009,6 +2009,55 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, } } +static bool is_net_admin_prog_type(enum bpf_prog_type prog_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_XDP: + case BPF_PROG_TYPE_LWT_IN: + case 
BPF_PROG_TYPE_LWT_OUT: + case BPF_PROG_TYPE_LWT_XMIT: + case BPF_PROG_TYPE_LWT_SEG6LOCAL: + case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_LIRC_MODE2: + case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_EXT: /* extends any prog */ + return true; + case BPF_PROG_TYPE_CGROUP_SKB: + /* always unpriv */ + case BPF_PROG_TYPE_SK_REUSEPORT: + /* equivalent to SOCKET_FILTER. need CAP_BPF only */ + default: + return false; + } +} + +static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_KPROBE: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */ + case BPF_PROG_TYPE_EXT: /* extends any prog */ + return true; + default: + return false; + } +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD attach_prog_fd @@ -2031,7 +2080,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) return -EPERM; /* copy eBPF program license from user space */ @@ -2044,11 +2093,16 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) is_gpl = license_is_gpl_compatible(license); if (attr->insn_cnt == 0 || - attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) + attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) return -E2BIG; if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) + return -EPERM; + + if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN)) + return -EPERM; + if (is_perfmon_prog_type(type) && !perfmon_capable()) return -EPERM; bpf_prog_load_fixup_attach_type(attr); @@ -2682,6 +2736,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_CGROUP_SOCKOPT: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: + if (!capable(CAP_NET_ADMIN)) + /* cg-skb progs can be loaded by unpriv user. + * check permissions at attach time. + */ + return -EPERM; return prog->enforce_expected_attach_type && prog->expected_attach_type != attach_type ? 
-EINVAL : 0; @@ -2747,9 +2806,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) struct bpf_prog *prog; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; @@ -2804,9 +2860,6 @@ static int bpf_prog_detach(const union bpf_attr *attr) { enum bpf_prog_type ptype; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_PROG_DETACH)) return -EINVAL; @@ -2819,6 +2872,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_LIRC_MODE2: return lirc_prog_detach(attr); case BPF_PROG_TYPE_FLOW_DISSECTOR: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; return skb_flow_dissector_bpf_prog_detach(attr); case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: @@ -2882,8 +2937,6 @@ static int bpf_prog_test_run(const union bpf_attr *attr, struct bpf_prog *prog; int ret = -ENOTSUPP; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; @@ -3184,7 +3237,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.run_time_ns = stats.nsecs; info.run_cnt = stats.cnt; - if (!capable(CAP_SYS_ADMIN)) { + if (!bpf_capable()) { info.jited_prog_len = 0; info.xlated_prog_len = 0; info.nr_jited_ksyms = 0; @@ -3543,7 +3596,7 @@ static int bpf_btf_load(const union bpf_attr *attr) if (CHECK_ATTR(BPF_BTF_LOAD)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; return btf_new_fd(attr); @@ -3766,9 +3819,6 @@ static int link_create(union bpf_attr *attr) struct bpf_prog *prog; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_LINK_CREATE)) return -EINVAL; @@ -3817,9 +3867,6 @@ static int link_update(union bpf_attr *attr) u32 flags; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_LINK_UPDATE)) return -EINVAL; @@ -3988,7 +4035,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz union bpf_attr attr; int err; - if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) + if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) return -EPERM; err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a3f2af756fd6..180933f6fba9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1295,7 +1295,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env, reg->type = SCALAR_VALUE; reg->var_off = tnum_unknown; reg->frameno = 0; - reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks; + reg->precise = env->subprog_cnt > 1 || !env->bpf_capable; __mark_reg_unbounded(reg); } @@ -1427,8 +1427,9 @@ static int check_subprogs(struct bpf_verifier_env *env) continue; if (insn[i].src_reg != BPF_PSEUDO_CALL) continue; - if (!env->allow_ptr_leaks) { - verbose(env, "function calls to other bpf functions are allowed for root only\n"); + if (!env->bpf_capable) { + verbose(env, + "function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); return -EPERM; } ret = add_subprog(env, i + insn[i].imm + 1); @@ -1962,8 +1963,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, bool new_marks = false; int i, err; - if (!env->allow_ptr_leaks) - /* backtracking is root only for now */ + if (!env->bpf_capable) return 0; func = st->frame[st->curframe]; @@ -2211,7 +2211,7 @@ static int check_stack_write(struct bpf_verifier_env *env, reg = &cur->regs[value_regno]; if (reg && size == BPF_REG_SIZE && register_is_const(reg) && - 
!register_is_null(reg) && env->allow_ptr_leaks) { + !register_is_null(reg) && env->bpf_capable) { if (dst_reg != BPF_REG_FP) { /* The backtracking logic can only recognize explicit * stack slot address like [fp - 8]. Other spill of @@ -2237,7 +2237,7 @@ static int check_stack_write(struct bpf_verifier_env *env, return -EINVAL; } - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v4) { bool sanitize = false; if (state->stack[spi].slot_type[0] == STACK_SPILL && @@ -3432,7 +3432,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, * Spectre masking for stack ALU. * See also retrieve_ptr_limit(). */ - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v1) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); @@ -4435,10 +4435,10 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, if (!BPF_MAP_PTR(aux->map_ptr_state)) bpf_map_ptr_store(aux, meta->map_ptr, - meta->map_ptr->unpriv_array); + !meta->map_ptr->bypass_spec_v1); else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr) bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON, - meta->map_ptr->unpriv_array); + !meta->map_ptr->bypass_spec_v1); return 0; } @@ -4807,7 +4807,7 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, const struct bpf_insn *insn) { - return env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K; + return env->bypass_spec_v1 || BPF_SRC(insn->code) == BPF_K; } static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, @@ -5117,7 +5117,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* For unprivileged we require that resulting offset must be in bounds * in order to be able to sanitize access later on. 
*/ - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v1) { if (dst_reg->type == PTR_TO_MAP_VALUE && check_map_access(env, dst, dst_reg->off, 1, false)) { verbose(env, "R%d pointer arithmetic of map value goes out of range, " @@ -7244,7 +7244,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, insn_stack[env->cfg.cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - if (loop_ok && env->allow_ptr_leaks) + if (loop_ok && env->bpf_capable) return 0; verbose_linfo(env, t, "%d: ", t); verbose_linfo(env, w, "%d: ", w); @@ -8353,7 +8353,7 @@ next: if (env->max_states_per_insn < states_cnt) env->max_states_per_insn = states_cnt; - if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) + if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) return push_jmp_history(env, cur); if (!add_new_state) @@ -10014,7 +10014,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->code = BPF_JMP | BPF_TAIL_CALL; aux = &env->insn_aux_data[i + delta]; - if (env->allow_ptr_leaks && !expect_blinding && + if (env->bpf_capable && !expect_blinding && prog->jit_requested && !bpf_map_key_poisoned(aux) && !bpf_map_ptr_poisoned(aux) && @@ -10758,7 +10758,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->insn_aux_data[i].orig_idx = i; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; - is_priv = capable(CAP_SYS_ADMIN); + is_priv = bpf_capable(); if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { mutex_lock(&bpf_verifier_lock); @@ -10799,7 +10799,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) env->strict_alignment = false; - env->allow_ptr_leaks = is_priv; + env->allow_ptr_leaks = bpf_allow_ptr_leaks(); + env->bypass_spec_v1 = bpf_bypass_spec_v1(); + env->bypass_spec_v4 = bpf_bypass_spec_v4(); + env->bpf_capable = bpf_capable(); if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d961428fb5b6..9a84d7fb4869 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -315,6 +315,9 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { static const struct bpf_func_proto *bpf_get_probe_write_proto(void) { + if (!capable(CAP_SYS_ADMIN)) + return NULL; + pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!", current->comm, task_pid_nr(current)); diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 756b63b6f7b3..d2c4d16dadba 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -625,7 +625,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) !attr->btf_key_type_id || !attr->btf_value_type_id) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; if (attr->value_size > MAX_VALUE_SIZE) @@ -978,7 +978,7 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) /* bpf_sk_storage_map is currently limited to CAP_SYS_ADMIN as * the map_alloc_check() side also does. 
*/ - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); nla_for_each_nested(nla, nla_stgs, rem) { diff --git a/net/core/filter.c b/net/core/filter.c index a85eb538d4d6..f8a3c7e9d027 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6687,7 +6687,7 @@ static bool cg_skb_is_valid_access(int off, int size, return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return false; break; } @@ -6699,7 +6699,7 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; case bpf_ctx_range(struct __sk_buff, tstamp): - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return false; break; default: -- cgit v1.2.3 From 2e186a2cf8c785f38ef3237e83f8921f82f6e2b7 Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Fri, 15 May 2020 11:52:52 +0200 Subject: net: core: recursively find netdev by device node The assumption that a device node is associated either with the netdev's device, or the parent of that device, does not hold for all drivers. E.g. Freescale's DPAA has two layers of platform devices above the netdev. Instead, recursively walk up the tree from the netdev, allowing any parent to match against the sought after node. Signed-off-by: Tobias Waldekranz Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 880e89c894f6..e353b822bb15 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1805,12 +1805,12 @@ static struct class net_class __ro_after_init = { #ifdef CONFIG_OF_NET static int of_dev_node_match(struct device *dev, const void *data) { - int ret = 0; - - if (dev->parent) - ret = dev->parent->of_node == data; + for (; dev; dev = dev->parent) { + if (dev->of_node == data) + return 1; + } - return ret == 0 ? dev->of_node == data : ret; + return 0; } /* -- cgit v1.2.3 From f8ab1807a9c9aa14478920e64d1c9d3685aae26f Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 15 May 2020 14:40:11 +0300 Subject: net: sched: introduce terse dump flag Add new TCA_DUMP_FLAGS attribute and use it in cls API to request terse filter output from classifiers with TCA_DUMP_FLAGS_TERSE flag. This option is intended to be used to improve performance of TC filter dump when userland only needs to obtain stats and not the whole classifier/action data. Extend struct tcf_proto_ops with new terse_dump() callback that must be defined by supporting classifier implementations. Support of the options in specific classifiers and actions is implemented in following patches in the series. Signed-off-by: Vlad Buslov Reviewed-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/sch_generic.h | 4 ++++ include/uapi/linux/rtnetlink.h | 6 ++++++ net/sched/cls_api.c | 39 +++++++++++++++++++++++++++++++-------- 3 files changed, 41 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index ab87a8b86a32..c510b03b9751 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -330,6 +330,10 @@ struct tcf_proto_ops { int (*dump)(struct net*, struct tcf_proto*, void *, struct sk_buff *skb, struct tcmsg*, bool); + int (*terse_dump)(struct net *net, + struct tcf_proto *tp, void *fh, + struct sk_buff *skb, + struct tcmsg *t, bool rtnl_held); int (*tmplt_dump)(struct sk_buff *skb, struct net *net, void *tmplt_priv); diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 4a8c5b745157..073e71ef6bdd 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -609,11 +609,17 @@ enum { TCA_HW_OFFLOAD, TCA_INGRESS_BLOCK, TCA_EGRESS_BLOCK, + TCA_DUMP_FLAGS, __TCA_MAX }; #define TCA_MAX (__TCA_MAX - 1) +#define TCA_DUMP_FLAGS_TERSE (1 << 0) /* Means that in dump user gets only basic + * data necessary to identify the objects + * (handle, cookie, etc.) and stats. + */ + #define TCA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcmsg)))) #define TCA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcmsg)) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 299b963c796e..cb2c10e0fee5 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1851,7 +1851,7 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp, struct tcf_block *block, struct Qdisc *q, u32 parent, void *fh, u32 portid, u32 seq, u16 flags, int event, - bool rtnl_held) + bool terse_dump, bool rtnl_held) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -1878,6 +1878,14 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; if (!fh) { tcm->tcm_handle = 0; + } else if (terse_dump) { + if (tp->ops->terse_dump) { + if (tp->ops->terse_dump(net, tp, fh, skb, tcm, + rtnl_held) < 0) + goto nla_put_failure; + } else { + goto cls_op_not_supp; + } } else { if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm, rtnl_held) < 0) @@ -1888,6 +1896,7 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, out_nlmsg_trim: nla_put_failure: +cls_op_not_supp: nlmsg_trim(skb, b); return -1; } @@ -1908,7 +1917,7 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, event, - rtnl_held) <= 0) { + false, rtnl_held) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -1940,7 +1949,7 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER, - rtnl_held) <= 0) { + false, rtnl_held) <= 0) { NL_SET_ERR_MSG(extack, "Failed to build del event notification"); kfree_skb(skb); return -EINVAL; @@ -2501,6 +2510,7 @@ struct tcf_dump_args { struct tcf_block *block; struct Qdisc *q; u32 parent; + bool terse_dump; }; static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg) @@ -2511,12 +2521,12 @@ static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg) return tcf_fill_node(net, a->skb, tp, a->block, a->q, a->parent, n, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWTFILTER, true); + RTM_NEWTFILTER, a->terse_dump, true); } 
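/* Illustration only, not part of this patch: a userspace dump request
 * asking for terse output. TCA_DUMP_FLAGS is validated as a
 * NLA_BITFIELD32 (see the tcf_tfilter_dump_policy added below), so the
 * payload is a value/selector pair. The libmnl helpers and the device
 * name here are assumptions for the sketch, not code from this series.
 */
char buf[MNL_SOCKET_BUFFER_SIZE];
struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
struct tcmsg *tcm;
struct nla_bitfield32 terse = {
	.value    = TCA_DUMP_FLAGS_TERSE,
	.selector = TCA_DUMP_FLAGS_TERSE,
};

nlh->nlmsg_type  = RTM_GETTFILTER;
nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
tcm->tcm_family  = AF_UNSPEC;
tcm->tcm_ifindex = if_nametoindex("eth0");	/* assumed device */
mnl_attr_put(nlh, TCA_DUMP_FLAGS, sizeof(terse), &terse);
/* Each filter in the resulting dump then carries only identification
 * data and stats instead of the full classifier options.
 */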
static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, struct sk_buff *skb, struct netlink_callback *cb, - long index_start, long *p_index) + long index_start, long *p_index, bool terse) { struct net *net = sock_net(skb->sk); struct tcf_block *block = chain->block; @@ -2545,7 +2555,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, if (tcf_fill_node(net, skb, tp, block, q, parent, NULL, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWTFILTER, true) <= 0) + RTM_NEWTFILTER, false, true) <= 0) goto errout; cb->args[1] = 1; } @@ -2561,6 +2571,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, arg.w.skip = cb->args[1] - 1; arg.w.count = 0; arg.w.cookie = cb->args[2]; + arg.terse_dump = terse; tp->ops->walk(tp, &arg.w, true); cb->args[2] = arg.w.cookie; cb->args[1] = arg.w.count + 1; @@ -2574,6 +2585,10 @@ errout: return false; } +static const struct nla_policy tcf_tfilter_dump_policy[TCA_MAX + 1] = { + [TCA_DUMP_FLAGS] = NLA_POLICY_BITFIELD32(TCA_DUMP_FLAGS_TERSE), +}; + /* called with RTNL */ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) { @@ -2583,6 +2598,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) struct Qdisc *q = NULL; struct tcf_block *block; struct tcmsg *tcm = nlmsg_data(cb->nlh); + bool terse_dump = false; long index_start; long index; u32 parent; @@ -2592,10 +2608,17 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; err = nlmsg_parse_deprecated(cb->nlh, sizeof(*tcm), tca, TCA_MAX, - NULL, cb->extack); + tcf_tfilter_dump_policy, cb->extack); if (err) return err; + if (tca[TCA_DUMP_FLAGS]) { + struct nla_bitfield32 flags = + nla_get_bitfield32(tca[TCA_DUMP_FLAGS]); + + terse_dump = flags.value & TCA_DUMP_FLAGS_TERSE; + } + if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) { block = tcf_block_refcnt_get(net, tcm->tcm_block_index); if (!block) @@ -2653,7 +2676,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) nla_get_u32(tca[TCA_CHAIN]) != chain->index) continue; if (!tcf_chain_dump(chain, q, parent, skb, cb, - index_start, &index)) { + index_start, &index, terse_dump)) { tcf_chain_put(chain); err = -EMSGSIZE; break; -- cgit v1.2.3 From ca44b738e59420ae73d9e04a1be630a405e3a0f1 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 15 May 2020 14:40:12 +0300 Subject: net: sched: implement terse dump support in act Extend tcf_action_dump() with boolean argument 'terse' that is used to request terse-mode action dump. In terse mode only essential data needed to identify particular action (action kind, cookie, etc.) and its stats is put to resulting skb and everything else is omitted. Implement tcf_exts_terse_dump() helper in cls API that is intended to be used to request terse dump of all exts (actions) attached to the filter. Signed-off-by: Vlad Buslov Reviewed-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/act_api.h | 2 +- include/net/pkt_cls.h | 1 + net/sched/act_api.c | 30 +++++++++++++++++++++++------- net/sched/cls_api.c | 28 +++++++++++++++++++++++++++- 4 files changed, 52 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index c24d7643548e..1b4bfc4437be 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -193,7 +193,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack); int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, - int ref); + int ref, bool terse); int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int); diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 04aa0649f3b0..ed65619cbc47 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -325,6 +325,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, void tcf_exts_destroy(struct tcf_exts *exts); void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src); int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts); +int tcf_exts_terse_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts); /** diff --git a/net/sched/act_api.c b/net/sched/act_api.c index fbbec2e562f5..8ac7eb0a8309 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -766,12 +766,10 @@ tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) return a->ops->dump(skb, a, bind, ref); } -int -tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +static int +tcf_action_dump_terse(struct sk_buff *skb, struct tc_action *a) { - int err = -EINVAL; unsigned char *b = skb_tail_pointer(skb); - struct nlattr *nest; struct tc_cookie *cookie; if (nla_put_string(skb, TCA_KIND, a->ops->kind)) @@ -789,6 +787,23 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) } rcu_read_unlock(); + return 0; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +int +tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +{ + int err = -EINVAL; + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; + + if (tcf_action_dump_terse(skb, a)) + goto nla_put_failure; + if (a->hw_stats != TCA_ACT_HW_STATS_ANY && nla_put_bitfield32(skb, TCA_ACT_HW_STATS, a->hw_stats, TCA_ACT_HW_STATS_ANY)) @@ -820,7 +835,7 @@ nla_put_failure: EXPORT_SYMBOL(tcf_action_dump_1); int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], - int bind, int ref) + int bind, int ref, bool terse) { struct tc_action *a; int err = -EINVAL, i; @@ -831,7 +846,8 @@ int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], nest = nla_nest_start_noflag(skb, i + 1); if (nest == NULL) goto nla_put_failure; - err = tcf_action_dump_1(skb, a, bind, ref); + err = terse ? 
tcf_action_dump_terse(skb, a) : + tcf_action_dump_1(skb, a, bind, ref); if (err < 0) goto errout; nla_nest_end(skb, nest); @@ -1133,7 +1149,7 @@ static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[], if (!nest) goto out_nlmsg_trim; - if (tcf_action_dump(skb, actions, bind, ref) < 0) + if (tcf_action_dump(skb, actions, bind, ref, false) < 0) goto out_nlmsg_trim; nla_nest_end(skb, nest); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index cb2c10e0fee5..752d608f4442 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -3179,7 +3179,8 @@ int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts) if (nest == NULL) goto nla_put_failure; - if (tcf_action_dump(skb, exts->actions, 0, 0) < 0) + if (tcf_action_dump(skb, exts->actions, 0, 0, false) + < 0) goto nla_put_failure; nla_nest_end(skb, nest); } else if (exts->police) { @@ -3203,6 +3204,31 @@ nla_put_failure: } EXPORT_SYMBOL(tcf_exts_dump); +int tcf_exts_terse_dump(struct sk_buff *skb, struct tcf_exts *exts) +{ +#ifdef CONFIG_NET_CLS_ACT + struct nlattr *nest; + + if (!exts->action || !tcf_exts_has_actions(exts)) + return 0; + + nest = nla_nest_start_noflag(skb, exts->action); + if (!nest) + goto nla_put_failure; + + if (tcf_action_dump(skb, exts->actions, 0, 0, true) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +#else + return 0; +#endif +} +EXPORT_SYMBOL(tcf_exts_terse_dump); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts) { -- cgit v1.2.3 From 0348451db9fffebd01d7b496e108dd729c2fcb24 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 15 May 2020 14:40:13 +0300 Subject: net: sched: cls_flower: implement terse dump support Implement tcf_proto_ops->terse_dump() callback for flower classifier. Only dump handle, flags and action data in terse mode. Signed-off-by: Vlad Buslov Reviewed-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/sched/cls_flower.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 74a0febcafb8..0c574700da75 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -2768,6 +2768,48 @@ nla_put_failure: return -1; } +static int fl_terse_dump(struct net *net, struct tcf_proto *tp, void *fh, + struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) +{ + struct cls_fl_filter *f = fh; + struct nlattr *nest; + bool skip_hw; + + if (!f) + return skb->len; + + t->tcm_handle = f->handle; + + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + + spin_lock(&tp->lock); + + skip_hw = tc_skip_hw(f->flags); + + if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags)) + goto nla_put_failure_locked; + + spin_unlock(&tp->lock); + + if (!skip_hw) + fl_hw_update_stats(tp, f, rtnl_held); + + if (tcf_exts_terse_dump(skb, &f->exts)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + return skb->len; + +nla_put_failure_locked: + spin_unlock(&tp->lock); +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + static int fl_tmplt_dump(struct sk_buff *skb, struct net *net, void *tmplt_priv) { struct fl_flow_tmplt *tmplt = tmplt_priv; @@ -2832,6 +2874,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = { .hw_add = fl_hw_add, .hw_del = fl_hw_del, .dump = fl_dump, + .terse_dump = fl_terse_dump, .bind_class = fl_bind_class, .tmplt_create = fl_tmplt_create, .tmplt_destroy = fl_tmplt_destroy, -- cgit v1.2.3 From 90bf45134d55d626ae2713cac50cda10c6c8b0c2 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 15 May 2020 19:22:15 +0200 Subject: mptcp: add new sock flag to deal with join subflows MP_JOIN subflows must not land in the accept queue. Currently tcp_check_req() calls an mptcp specific helper to detect such a scenario. Such helper leverages the subflow context to check for MP_JOIN subflows. We also need to deal with MP JOIN failures, even when the subflow context is not available due to allocation failure. A possible solution would be changing the syn_recv_sock() signature to allow returning a more descriptive action/error code and deal with that in tcp_check_req(). Since the above need is MPTCP specific, this patch instead uses a TCP request socket hole to add an MPTCP-specific flag. Such flag is used by the MPTCP syn_recv_sock() to tell tcp_check_req() how to deal with the request socket. This change is a no-op for !MPTCP builds, and makes the MPTCP code simpler. It also allows the next patch to deal correctly with MP JOIN failure. v1 -> v2: - be more conservative on drop_req initialization (Mat) RFC -> v1: - move the drop_req bit inside tcp_request_sock (Eric) Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Reviewed-by: Christoph Paasch Signed-off-by: David S.
Miller --- include/linux/tcp.h | 3 +++ include/net/mptcp.h | 17 ++++++++++------- net/ipv4/tcp_minisocks.c | 2 +- net/mptcp/protocol.c | 7 ------- net/mptcp/subflow.c | 3 +++ 5 files changed, 17 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index e60db06ec28d..bf44e85d709d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -120,6 +120,9 @@ struct tcp_request_sock { u64 snt_synack; /* first SYNACK sent time */ bool tfo_listener; bool is_mptcp; +#if IS_ENABLED(CONFIG_MPTCP) + bool drop_req; +#endif u32 txhash; u32 rcv_isn; u32 snt_isn; diff --git a/include/net/mptcp.h b/include/net/mptcp.h index e60275659de6..c4a6ef4ba35b 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -68,6 +68,11 @@ static inline bool rsk_is_mptcp(const struct request_sock *req) return tcp_rsk(req)->is_mptcp; } +static inline bool rsk_drop_req(const struct request_sock *req) +{ + return tcp_rsk(req)->is_mptcp && tcp_rsk(req)->drop_req; +} + void mptcp_space(const struct sock *ssk, int *space, int *full_space); bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, unsigned int *size, struct mptcp_out_options *opts); @@ -121,8 +126,6 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, skb_ext_find(from, SKB_EXT_MPTCP)); } -bool mptcp_sk_is_subflow(const struct sock *sk); - void mptcp_seq_show(struct seq_file *seq); #else @@ -140,6 +143,11 @@ static inline bool rsk_is_mptcp(const struct request_sock *req) return false; } +static inline bool rsk_drop_req(const struct request_sock *req) +{ + return false; +} + static inline void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, int opsize, struct tcp_options_received *opt_rx) @@ -190,11 +198,6 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, return true; } -static inline bool mptcp_sk_is_subflow(const struct sock *sk) -{ - return false; -} - static inline void mptcp_space(const struct sock *ssk, int *s, int *fs) { } static inline void mptcp_seq_show(struct seq_file *seq) { } #endif /* CONFIG_MPTCP */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 7e40322cc5ec..495dda2449fe 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -774,7 +774,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!child) goto listen_overflow; - if (own_req && sk_is_mptcp(child) && mptcp_sk_is_subflow(child)) { + if (own_req && rsk_drop_req(req)) { reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); inet_csk_reqsk_queue_drop_and_put(sk, req); return child; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e1f23016ed3f..a61e60e94137 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1638,13 +1638,6 @@ bool mptcp_finish_join(struct sock *sk) return ret; } -bool mptcp_sk_is_subflow(const struct sock *sk) -{ - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - - return subflow->mp_join == 1; -} - static bool mptcp_memory_free(const struct sock *sk, int wake) { struct mptcp_sock *msk = mptcp_sk(sk); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 009d5c478062..5e03ed8ae899 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -470,6 +470,8 @@ create_child: if (child && *own_req) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child); + tcp_rsk(req)->drop_req = false; + /* we need to fallback on ctx allocation failure and on pre-reqs * checking above. 
In the latter scenario we additionally need * to reset the context to non MPTCP status. @@ -512,6 +514,7 @@ create_child: goto close_child; SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX); + tcp_rsk(req)->drop_req = true; } } -- cgit v1.2.3 From 2f8a397d0a54b59c05e481523ab2a88a63d82d18 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 15 May 2020 19:22:16 +0200 Subject: inet_connection_sock: factor out destroy helper. Move the steps to prepare an inet_connection_sock for forced disposal inside a separate helper. No functional changes intended, this will just simplify the next patch. Signed-off-by: Paolo Abeni Reviewed-by: Christoph Paasch Reviewed-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 8 ++++++++ net/ipv4/inet_connection_sock.c | 6 +----- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index a3f076befa4f..2f1f8c3efb26 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -287,6 +287,14 @@ static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req); void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req); +static inline void inet_csk_prepare_for_destroy_sock(struct sock *sk) +{ + /* The below has to be done to allow calling inet_csk_destroy_sock */ + sock_set_flag(sk, SOCK_DEAD); + percpu_counter_inc(sk->sk_prot->orphan_count); + inet_sk(sk)->inet_num = 0; +} + void inet_csk_destroy_sock(struct sock *sk); void inet_csk_prepare_forced_close(struct sock *sk); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 5f34eb951627..d6faf3702824 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -896,11 +896,7 @@ void inet_csk_prepare_forced_close(struct sock *sk) /* sk_clone_lock locked the socket and set refcnt to 2 */ bh_unlock_sock(sk); sock_put(sk); - - /* The below has to be done to allow calling inet_csk_destroy_sock */ - sock_set_flag(sk, SOCK_DEAD); - percpu_counter_inc(sk->sk_prot->orphan_count); - inet_sk(sk)->inet_num = 0; + inet_csk_prepare_for_destroy_sock(sk); } EXPORT_SYMBOL(inet_csk_prepare_forced_close); -- cgit v1.2.3 From 729cd6436f359b6e618c2f14836d419f40444503 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 15 May 2020 19:22:17 +0200 Subject: mptcp: cope better with MP_JOIN failure Currently, on MP_JOIN failure we reset the child socket, but leave the request socket untouched. tcp_check_req will deal with it according to the 'tcp_abort_on_overflow' sysctl value - by default the req socket will stay alive. The above leads to inconsistent behavior on MP JOIN failure, and bad listener overflow accounting. This patch addresses the issue by leveraging the infrastructure just introduced to ask the TCP stack to drop the req on failure. The child socket is no longer freed by subflow_syn_recv_sock(); instead it's moved to a dead state and will be disposed of by the next sock_put done by the TCP stack, so that listener overflow accounting is not affected by MP JOIN failure. Signed-off-by: Paolo Abeni Reviewed-by: Christoph Paasch Reviewed-by: Mat Martineau Signed-off-by: David S.
Miller --- net/mptcp/subflow.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 5e03ed8ae899..3cf2eeea9d80 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -478,7 +478,7 @@ create_child: */ if (!ctx || fallback) { if (fallback_is_fatal) - goto close_child; + goto dispose_child; if (ctx) { subflow_ulp_fallback(child, ctx); @@ -507,11 +507,11 @@ create_child: owner = mptcp_token_get_sock(ctx->token); if (!owner) - goto close_child; + goto dispose_child; ctx->conn = (struct sock *)owner; if (!mptcp_finish_join(child)) - goto close_child; + goto dispose_child; SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX); tcp_rsk(req)->drop_req = true; @@ -531,11 +531,14 @@ out: !mptcp_subflow_ctx(child)->conn)); return child; -close_child: +dispose_child: + tcp_rsk(req)->drop_req = true; tcp_send_active_reset(child, GFP_ATOMIC); - inet_csk_prepare_forced_close(child); + inet_csk_prepare_for_destroy_sock(child); tcp_done(child); - return NULL; + + /* The last child reference will be released by the caller */ + return child; } static struct inet_connection_sock_af_ops subflow_specific; -- cgit v1.2.3 From 9efd6a3cecdde984d67e63d17fe6af53c7c50968 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 13 May 2020 15:58:43 +0200 Subject: netns: enable to inherit devconf from current netns The goal is to be able to inherit the initial devconf parameters from the current netns, i.e. the netns where this new netns has been created. This is useful in a container environment where /proc/sys is read-only. For example, if a pod is created with specific devconf parameters and has the capability to create netns, the user expects to get the same parameters as its 'init_net', which is not the real init_net in this case. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- Documentation/admin-guide/sysctl/net.rst | 4 +++- net/core/sysctl_net_core.c | 4 +++- net/ipv4/devinet.c | 23 ++++++++++++++++++----- net/ipv6/addrconf.c | 23 ++++++++++++++++++++--- 4 files changed, 44 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index 2ad1b77a7182..42cd04bca548 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -339,7 +339,9 @@ settings from init_net and for IPv6 we reset all settings to default. If set to 1, both IPv4 and IPv6 settings are forced to inherit from current ones in init_net. If set to 2, both IPv4 and IPv6 settings are -forced to reset to their default values. +forced to reset to their default values. If set to 3, both IPv4 and IPv6 settings are forced to inherit from current ones in the netns where this new netns has been created.
Default : 0 (for compatibility reasons) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 0ddb13a6282b..b109cc8a6dd8 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -23,6 +23,7 @@ #include static int two __maybe_unused = 2; +static int three = 3; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; @@ -39,6 +40,7 @@ EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); * IPv6: reset all settings to default * 1 - Both inherit all current settings from init_net * 2 - Both reset all settings to default + * 3 - Both inherit all settings from current netns */ int sysctl_devconf_inherit_init_net __read_mostly; EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); @@ -553,7 +555,7 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = &three, }, { .procname = "high_order_alloc_disable", diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index fc94f82f82c7..f048d0a188b7 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2666,11 +2666,24 @@ static __net_init int devinet_init_net(struct net *net) tbl[0].extra2 = net; #endif - if ((!IS_ENABLED(CONFIG_SYSCTL) || - sysctl_devconf_inherit_init_net != 2) && - !net_eq(net, &init_net)) { - memcpy(all, init_net.ipv4.devconf_all, sizeof(ipv4_devconf)); - memcpy(dflt, init_net.ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); + if (!net_eq(net, &init_net)) { + if (IS_ENABLED(CONFIG_SYSCTL) && + sysctl_devconf_inherit_init_net == 3) { + /* copy from the current netns */ + memcpy(all, current->nsproxy->net_ns->ipv4.devconf_all, + sizeof(ipv4_devconf)); + memcpy(dflt, + current->nsproxy->net_ns->ipv4.devconf_dflt, + sizeof(ipv4_devconf_dflt)); + } else if (!IS_ENABLED(CONFIG_SYSCTL) || + sysctl_devconf_inherit_init_net != 2) { + /* inherit == 0 or 1: copy from init_net */ + memcpy(all, init_net.ipv4.devconf_all, + sizeof(ipv4_devconf)); + memcpy(dflt, init_net.ipv4.devconf_dflt, + sizeof(ipv4_devconf_dflt)); + } + /* else inherit == 2: use compiled values */ } #ifdef CONFIG_SYSCTL diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index fd885f06c4ed..ab7e839753ae 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6991,9 +6991,26 @@ static int __net_init addrconf_init_net(struct net *net) goto err_alloc_dflt; if (IS_ENABLED(CONFIG_SYSCTL) && - sysctl_devconf_inherit_init_net == 1 && !net_eq(net, &init_net)) { - memcpy(all, init_net.ipv6.devconf_all, sizeof(ipv6_devconf)); - memcpy(dflt, init_net.ipv6.devconf_dflt, sizeof(ipv6_devconf_dflt)); + !net_eq(net, &init_net)) { + switch (sysctl_devconf_inherit_init_net) { + case 1: /* copy from init_net */ + memcpy(all, init_net.ipv6.devconf_all, + sizeof(ipv6_devconf)); + memcpy(dflt, init_net.ipv6.devconf_dflt, + sizeof(ipv6_devconf_dflt)); + break; + case 3: /* copy from the current netns */ + memcpy(all, current->nsproxy->net_ns->ipv6.devconf_all, + sizeof(ipv6_devconf)); + memcpy(dflt, + current->nsproxy->net_ns->ipv6.devconf_dflt, + sizeof(ipv6_devconf_dflt)); + break; + case 0: + case 2: + /* use compiled values */ + break; + } } /* these will be inherited by all namespaces */ -- cgit v1.2.3 From a0c1d0eafd1ef1ada3b588ea205e5bc37ae0d8d9 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Thu, 14 May 2020 08:53:03 -0700 Subject: mptcp: Use 32-bit DATA_ACK when possible RFC8684 allows to send 32-bit DATA_ACKs as long as the peer is not sending 64-bit data-sequence numbers. 
The 64-bit DSN is only there for extreme scenarios when a very high throughput subflow is combined with a long-RTT subflow such that the high-throughput subflow wraps around the 32-bit sequence number space within an RTT of the high-RTT subflow. It is thus a rare scenario and we should try to use the 32-bit DATA_ACK instead as long as possible. It reduces the TCP-option overhead by 4 bytes, thus making space for an additional SACK block. It also makes tcpdumps much easier to read when the DSN and DATA_ACK are both either 32 or 64-bit. Signed-off-by: Christoph Paasch Reviewed-by: Matthieu Baerts Signed-off-by: David S. Miller --- include/net/mptcp.h | 5 ++++- net/mptcp/options.c | 33 ++++++++++++++++++++++++--------- net/mptcp/protocol.h | 1 + net/mptcp/subflow.c | 2 ++ 4 files changed, 31 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index c4a6ef4ba35b..46d0487d2b22 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -16,7 +16,10 @@ struct seq_file; /* MPTCP sk_buff extension data */ struct mptcp_ext { - u64 data_ack; + union { + u64 data_ack; + u32 data_ack32; + }; u64 data_seq; u32 subflow_seq; u16 data_len; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 45497af23906..ece6f92cf7d1 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -516,7 +516,16 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, return ret; } - ack_size = TCPOLEN_MPTCP_DSS_ACK64; + if (subflow->use_64bit_ack) { + ack_size = TCPOLEN_MPTCP_DSS_ACK64; + opts->ext_copy.data_ack = msk->ack_seq; + opts->ext_copy.ack64 = 1; + } else { + ack_size = TCPOLEN_MPTCP_DSS_ACK32; + opts->ext_copy.data_ack32 = (uint32_t)(msk->ack_seq); + opts->ext_copy.ack64 = 0; + } + opts->ext_copy.use_ack = 1; /* Add kind/length/subtype/flag overhead if mapping is not populated */ if (dss_size == 0) @@ -524,10 +533,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, dss_size += ack_size; - opts->ext_copy.data_ack = msk->ack_seq; - opts->ext_copy.ack64 = 1; - opts->ext_copy.use_ack = 1; - *size = ALIGN(dss_size, 4); return true; } @@ -986,8 +991,13 @@ mp_capable_done: u8 flags = 0; if (mpext->use_ack) { - len += TCPOLEN_MPTCP_DSS_ACK64; - flags = MPTCP_DSS_HAS_ACK | MPTCP_DSS_ACK64; + flags = MPTCP_DSS_HAS_ACK; + if (mpext->ack64) { + len += TCPOLEN_MPTCP_DSS_ACK64; + flags |= MPTCP_DSS_ACK64; + } else { + len += TCPOLEN_MPTCP_DSS_ACK32; + } } if (mpext->use_map) { @@ -1004,8 +1014,13 @@ mp_capable_done: *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags); if (mpext->use_ack) { - put_unaligned_be64(mpext->data_ack, ptr); - ptr += 2; + if (mpext->ack64) { + put_unaligned_be64(mpext->data_ack, ptr); + ptr += 2; + } else { + put_unaligned_be32(mpext->data_ack32, ptr); + ptr += 1; + } } if (mpext->use_map) { diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index e4ca6320ce76..f5adca93e8fb 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -290,6 +290,7 @@ struct mptcp_subflow_context { data_avail : 1, rx_eof : 1, data_fin_tx_enable : 1, + use_64bit_ack : 1, /* Set when we received a 64-bit DSN */ can_ack : 1; /* only after processing the remote a key */ u64 data_fin_tx_seq; u32 remote_nonce; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 53c75b0e5dce..0020d356233d 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -667,9 +667,11 @@ static enum mapping_status get_mapping_status(struct sock *ssk) if (!mpext->dsn64) { map_seq =
expand_seq(subflow->map_seq, subflow->map_data_len, mpext->data_seq); + subflow->use_64bit_ack = 0; pr_debug("expanded seq=%llu", subflow->map_seq); } else { map_seq = mpext->data_seq; + subflow->use_64bit_ack = 1; } if (subflow->map_valid) { -- cgit v1.2.3 From 7be92514b99c15b89def6d72bbc84c354f89a025 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 15 May 2020 12:49:00 -0700 Subject: ethtool: check if there is at least one channel for TX/RX in the core Having a channel config with no ability to RX or TX traffic is clearly wrong. Check for this in the core so the drivers don't have to. Signed-off-by: Jakub Kicinski Reviewed-by: Michal Kubecek Signed-off-by: David S. Miller --- net/ethtool/channels.c | 20 ++++++++++++++++++-- net/ethtool/ioctl.c | 5 +++++ 2 files changed, 23 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c index 389924b65d05..3aa4975919d7 100644 --- a/net/ethtool/channels.c +++ b/net/ethtool/channels.c @@ -129,13 +129,13 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[ETHTOOL_A_CHANNELS_MAX + 1]; unsigned int from_channel, old_total, i; + bool mod = false, mod_combined = false; struct ethtool_channels channels = {}; struct ethnl_req_info req_info = {}; const struct nlattr *err_attr; const struct ethtool_ops *ops; struct net_device *dev; u32 max_rx_in_use = 0; - bool mod = false; int ret; ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, @@ -170,7 +170,8 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) ethnl_update_u32(&channels.other_count, tb[ETHTOOL_A_CHANNELS_OTHER_COUNT], &mod); ethnl_update_u32(&channels.combined_count, - tb[ETHTOOL_A_CHANNELS_COMBINED_COUNT], &mod); + tb[ETHTOOL_A_CHANNELS_COMBINED_COUNT], &mod_combined); + mod |= mod_combined; ret = 0; if (!mod) goto out_ops; @@ -193,6 +194,21 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) goto out_ops; } + /* ensure there is at least one RX and one TX channel */ + if (!channels.combined_count && !channels.rx_count) + err_attr = tb[ETHTOOL_A_CHANNELS_RX_COUNT]; + else if (!channels.combined_count && !channels.tx_count) + err_attr = tb[ETHTOOL_A_CHANNELS_TX_COUNT]; + else + err_attr = NULL; + if (err_attr) { + if (mod_combined) + err_attr = tb[ETHTOOL_A_CHANNELS_COMBINED_COUNT]; + ret = -EINVAL; + NL_SET_ERR_MSG_ATTR(info->extack, err_attr, "requested channel counts would result in no RX or TX channel being configured"); + goto out_ops; + } + /* ensure the new Rx count fits within the configured Rx flow * indirection table settings */ diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 52102ab1709b..a574d60111fa 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1676,6 +1676,11 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, channels.other_count > curr.max_other) return -EINVAL; + /* ensure there is at least one RX and one TX channel */ + if (!channels.combined_count && + (!channels.rx_count || !channels.tx_count)) + return -EINVAL; + /* ensure the new Rx count fits within the configured Rx flow * indirection table settings */ if (netif_is_rxfh_configured(dev) && -- cgit v1.2.3 From 75c36dbb1c3790eab909344b758decce6bb432da Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 15 May 2020 12:49:02 -0700 Subject: ethtool: don't call set_channels in drivers if config didn't change Don't call drivers if nothing changed. Netlink code already contains this logic. 
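For illustration, a sketch of what the ioctl path now guarantees before any driver callback runs; the structure is the standard ethtool UAPI one, while the socket/ifreq plumbing is omitted and assumed:

	struct ethtool_channels ch = {
		.cmd            = ETHTOOL_SCHANNELS,
		.rx_count       = 0,   /* no dedicated RX channels */
		.combined_count = 0,   /* and no combined channels */
		.tx_count       = 4,
	};
	/* ethtool_set_channels() now rejects this request with -EINVAL in
	 * the core, since no RX path would remain; and a request whose
	 * counts match the current configuration returns 0 without
	 * ops->set_channels() ever being called.
	 */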
Signed-off-by: Jakub Kicinski Reviewed-by: Michal Kubecek Signed-off-by: David S. Miller --- net/ethtool/ioctl.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index a574d60111fa..eeb1137a3f23 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1669,6 +1669,12 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, dev->ethtool_ops->get_channels(dev, &curr); + if (channels.rx_count == curr.rx_count && + channels.tx_count == curr.tx_count && + channels.combined_count == curr.combined_count && + channels.other_count == curr.other_count) + return 0; + /* ensure new counts are within the maximums */ if (channels.rx_count > curr.max_rx || channels.tx_count > curr.max_tx || -- cgit v1.2.3 From a0e17064d43e445181bc004d949a4855ea8ccf9c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 16 May 2020 10:46:17 +0200 Subject: mptcp: move common nospace-pattern to a helper Paolo noticed that ssk_check_wmem() has the same pattern, so add/use a common helper for both places. Suggested-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 1f52a0fa31ed..0413454fcdaf 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -653,6 +653,15 @@ out: return ret; } +static void mptcp_nospace(struct mptcp_sock *msk, struct socket *sock) +{ + clear_bit(MPTCP_SEND_SPACE, &msk->flags); + smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ + + /* enables sk->write_space() callbacks */ + set_bit(SOCK_NOSPACE, &sock->flags); +} + static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; @@ -666,13 +675,8 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) if (!sk_stream_memory_free(ssk)) { struct socket *sock = ssk->sk_socket; - if (sock) { - clear_bit(MPTCP_SEND_SPACE, &msk->flags); - smp_mb__after_atomic(); - - /* enables sk->write_space() callbacks */ - set_bit(SOCK_NOSPACE, &sock->flags); - } + if (sock) + mptcp_nospace(msk, sock); return NULL; } @@ -698,13 +702,8 @@ static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk) return; sock = READ_ONCE(ssk->sk_socket); - - if (sock) { - clear_bit(MPTCP_SEND_SPACE, &msk->flags); - smp_mb__after_atomic(); - /* set NOSPACE only after clearing SEND_SPACE flag */ - set_bit(SOCK_NOSPACE, &sock->flags); - } + if (sock) + mptcp_nospace(msk, sock); } static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) -- cgit v1.2.3 From fb529e62d3f3e85001108213dc323c35f2765575 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 16 May 2020 10:46:18 +0200 Subject: mptcp: break and restart in case mptcp sndbuf is full It's not enough to check for available tcp send space. We also hold on to transmitted data for mptcp-level retransmits. Right now we will send more and more data if the peer can ack data at the tcp level fast enough, since that frees up tcp send buffer space. But we also need to check that data was acked and reclaimed at the mptcp level. Therefore add the needed check in mptcp_sendmsg, flush tcp data and wait until more mptcp snd space becomes available if we are over the limit. Before we wait for more data, also make sure we start the retransmit timer if we ran out of sndbuf space.
Otherwise there is a very small chance that we wait forever: * receiver is waiting for data * sender is blocked because mptcp socket buffer is full * at tcp level, all data was acked * mptcp-level snd_una was not updated, because last ack that acknowledged the last data packet carried an older MPTCP-ack. Restarting the retransmit timer avoids this problem: if TCP subflow is idle, data is retransmitted from the RTX queue. New data will make the peer send a new, updated MPTCP-Ack. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 0413454fcdaf..75be8d662ac5 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -739,9 +739,23 @@ fallback: mptcp_clean_una(sk); +wait_for_sndbuf: __mptcp_flush_join_list(msk); ssk = mptcp_subflow_get_send(msk); while (!sk_stream_memory_free(sk) || !ssk) { + if (ssk) { + /* make sure retransmit timer is + * running before we wait for memory. + * + * The retransmit timer might be needed + * to make the peer send an up-to-date + * MPTCP Ack. + */ + mptcp_set_timeout(sk, ssk); + if (!mptcp_timer_pending(sk)) + mptcp_reset_timer(sk); + } + ret = sk_stream_wait_memory(sk, &timeo); if (ret) goto out; @@ -776,6 +790,28 @@ fallback: } copied += ret; + + /* memory is charged to mptcp level socket as well, i.e. + * if msg is very large, mptcp socket may run out of buffer + * space. mptcp_clean_una() will release data that has + * been acked at mptcp level in the mean time, so there is + * a good chance we can continue sending data right away. + */ + if (unlikely(!sk_stream_memory_free(sk))) { + tcp_push(ssk, msg->msg_flags, mss_now, + tcp_sk(ssk)->nonagle, size_goal); + mptcp_clean_una(sk); + if (!sk_stream_memory_free(sk)) { + /* can't send more for now, need to wait for + * MPTCP-level ACKs from peer. + * + * Wakeup will happen via mptcp_clean_una(). + */ + mptcp_set_timeout(sk, ssk); + release_sock(ssk); + goto wait_for_sndbuf; + } + } } mptcp_set_timeout(sk, ssk); -- cgit v1.2.3 From 72511aab95c94d7c0f03d0b7db5df47fdca059f6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 16 May 2020 10:46:19 +0200 Subject: mptcp: avoid blocking in tcp_sendpages The transmit loop continues to xmit new data until an error is returned or all data was transmitted. For the blocking i/o case, this means that tcp_sendpages() may block on the subflow until more space becomes available, i.e. we end up sleeping with the mptcp socket lock held. Instead we should check if a different subflow is ready to be used. This restarts the subflow sk lookup when the tx operation succeeded and the tcp subflow can't accept more data or if tcp_sendpages indicates -EAGAIN on a blocking mptcp socket. In that case we also need to set the NOSPACE bit to make sure we get notified once memory becomes available. In case all subflows are busy, the existing logic will wait until a subflow is ready, releasing the mptcp socket lock while doing so. The mptcp worker already sets DONTWAIT, so no need to make changes there. v2: * set NOSPACE bit * add a comment to clarify that mptcp-sk sndbuf limits need to be checked as well. Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- net/mptcp/protocol.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 75be8d662ac5..e97357066b21 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -590,7 +590,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, * access the skb after the sendpages call */ ret = do_tcp_sendpages(ssk, page, offset, psize, - msg->msg_flags | MSG_SENDPAGE_NOTLAST); + msg->msg_flags | MSG_SENDPAGE_NOTLAST | MSG_DONTWAIT); if (ret <= 0) return ret; @@ -713,6 +713,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct socket *ssock; size_t copied = 0; struct sock *ssk; + bool tx_ok; long timeo; if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) @@ -737,6 +738,7 @@ fallback: return ret >= 0 ? ret + copied : (copied ? copied : ret); } +restart: mptcp_clean_una(sk); wait_for_sndbuf: @@ -772,11 +774,18 @@ wait_for_sndbuf: pr_debug("conn_list->subflow=%p", ssk); lock_sock(ssk); - while (msg_data_left(msg)) { + tx_ok = msg_data_left(msg); + while (tx_ok) { ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now, &size_goal); - if (ret < 0) + if (ret < 0) { + if (ret == -EAGAIN && timeo > 0) { + mptcp_set_timeout(sk, ssk); + release_sock(ssk); + goto restart; + } break; + } if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk))) { /* Can happen for passive sockets: * 3WHS negotiated MPTCP, but first packet after is @@ -791,11 +800,31 @@ wait_for_sndbuf: copied += ret; + tx_ok = msg_data_left(msg); + if (!tx_ok) + break; + + if (!sk_stream_memory_free(ssk)) { + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + tcp_push(ssk, msg->msg_flags, mss_now, + tcp_sk(ssk)->nonagle, size_goal); + mptcp_set_timeout(sk, ssk); + release_sock(ssk); + goto restart; + } + /* memory is charged to mptcp level socket as well, i.e. * if msg is very large, mptcp socket may run out of buffer * space. mptcp_clean_una() will release data that has * been acked at mptcp level in the mean time, so there is * a good chance we can continue sending data right away. + * + * Normally, when the tcp subflow can accept more data, then + * so can the MPTCP socket. However, we need to cope with + * peers that might lag behind in their MPTCP-level + * acknowledgements, i.e. data might have been acked at + * tcp level only. So, we must also check the MPTCP socket + * limits before we send more data. */ if (unlikely(!sk_stream_memory_free(sk))) { tcp_push(ssk, msg->msg_flags, mss_now, -- cgit v1.2.3 From 149f7c71e2c710a8ced836421a631953c9f84aa3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 16 May 2020 10:46:20 +0200 Subject: mptcp: fill skb extension cache outside of mptcp_sendmsg_frag The mptcp_sendmsg_frag helper contains a loop that will wait on the subflow sk. It seems preferable to only wait in mptcp_sendmsg() when blocking io is requested. mptcp_sendmsg already has such a wait loop that is used when no subflow socket is available for transmission. This is a preparation patch that makes sure we call mptcp_sendmsg_frag only if an skb extension has been allocated. Moreover, such allocation currently uses GFP_ATOMIC while it could use a sleeping allocation instead. Followup patches will remove the wait loop from mptcp_sendmsg_frag() and will allow doing a sleeping allocation for the extension. Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: David S.
Miller --- net/mptcp/protocol.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e97357066b21..1bdfbca1c23a 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -669,6 +669,9 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) sock_owned_by_me((const struct sock *)msk); + if (!mptcp_ext_cache_refill(msk)) + return NULL; + mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); @@ -804,7 +807,8 @@ wait_for_sndbuf: if (!tx_ok) break; - if (!sk_stream_memory_free(ssk)) { + if (!sk_stream_memory_free(ssk) || + !mptcp_ext_cache_refill(msk)) { set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, size_goal); @@ -1158,7 +1162,7 @@ static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); struct sock *ssk, *sk = &msk->sk.icsk_inet.sk; - int orig_len, orig_offset, ret, mss_now = 0, size_goal = 0; + int orig_len, orig_offset, mss_now = 0, size_goal = 0; struct mptcp_data_frag *dfrag; u64 orig_write_seq; size_t copied = 0; @@ -1180,6 +1184,9 @@ static void mptcp_worker(struct work_struct *work) if (!dfrag) goto unlock; + if (!mptcp_ext_cache_refill(msk)) + goto reset_unlock; + ssk = mptcp_subflow_get_retrans(msk); if (!ssk) goto reset_unlock; @@ -1191,8 +1198,8 @@ static void mptcp_worker(struct work_struct *work) orig_offset = dfrag->offset; orig_write_seq = dfrag->data_seq; while (dfrag->data_len > 0) { - ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo, &mss_now, - &size_goal); + int ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo, + &mss_now, &size_goal); + if (ret < 0) break; @@ -1200,6 +1207,9 @@ static void mptcp_worker(struct work_struct *work) copied += ret; dfrag->data_len -= ret; dfrag->offset += ret; + + if (!mptcp_ext_cache_refill(msk)) + break; } if (copied) tcp_push(ssk, msg.msg_flags, mss_now, tcp_sk(ssk)->nonagle, -- cgit v1.2.3 From 17091708d1e503383f20934631305ccb375b0eb1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 16 May 2020 10:46:21 +0200 Subject: mptcp: fill skb page frag cache outside of mptcp_sendmsg_frag The mptcp_sendmsg_frag helper contains a loop that will wait on the subflow sk. It seems preferable to only wait in mptcp_sendmsg() when blocking io is requested. mptcp_sendmsg already has such a wait loop that is used when no subflow socket is available for transmission. This is another preparation patch that makes sure we call mptcp_sendmsg_frag only if the page frag cache has been refilled. A followup patch will remove the wait loop from mptcp_sendmsg_frag(). The retransmit worker doesn't need to do this refill as it won't transmit new mptcp-level data. Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 1bdfbca1c23a..a11e51222e59 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -713,6 +713,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { int mss_now = 0, size_goal = 0, ret = 0; struct mptcp_sock *msk = mptcp_sk(sk); + struct page_frag *pfrag; struct socket *ssock; size_t copied = 0; struct sock *ssk; @@ -741,13 +742,16 @@ fallback: return ret >= 0 ? ret + copied : (copied ?
copied : ret); } + pfrag = sk_page_frag(sk); restart: mptcp_clean_una(sk); wait_for_sndbuf: __mptcp_flush_join_list(msk); ssk = mptcp_subflow_get_send(msk); - while (!sk_stream_memory_free(sk) || !ssk) { + while (!sk_stream_memory_free(sk) || + !ssk || + !mptcp_page_frag_refill(ssk, pfrag)) { if (ssk) { /* make sure retransmit timer is * running before we wait for memory. @@ -808,6 +812,7 @@ wait_for_sndbuf: break; if (!sk_stream_memory_free(ssk) || + !mptcp_page_frag_refill(ssk, pfrag) || !mptcp_ext_cache_refill(msk)) { set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); tcp_push(ssk, msg->msg_flags, mss_now, -- cgit v1.2.3 From 5c8264435d4f6a056ac926989a827aba1961e3c8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 16 May 2020 10:46:22 +0200 Subject: mptcp: remove inner wait loop from mptcp_sendmsg_frag previous patches made sure we only call into this function when these prerequisites are met, so no need to wait on the subflow socket anymore. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/7 Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index a11e51222e59..bc950cf818f7 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -510,20 +510,6 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, * fooled into a warning if we don't init here */ pfrag = sk_page_frag(sk); - while ((!retransmission && !mptcp_page_frag_refill(ssk, pfrag)) || - !mptcp_ext_cache_refill(msk)) { - ret = sk_stream_wait_memory(ssk, timeo); - if (ret) - return ret; - - /* if sk_stream_wait_memory() sleeps snd_una can change - * significantly, refresh the rtx queue - */ - mptcp_clean_una(sk); - - if (unlikely(__mptcp_needs_tcp_fallback(msk))) - return 0; - } if (!retransmission) { write_seq = &msk->write_seq; page = pfrag->page; -- cgit v1.2.3 From 4930f4831b1547b52c5968e9307fe3d840d7fba0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 16 May 2020 10:46:23 +0200 Subject: net: allow __skb_ext_alloc to sleep mptcp calls this from the transmit side, from process context. Allow a sleeping allocation instead of unconditional GFP_ATOMIC. Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- net/core/skbuff.c | 8 +++++--- net/mptcp/protocol.c | 4 +++- 3 files changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3000c526f552..531843952809 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4165,7 +4165,7 @@ struct skb_ext { char data[] __aligned(8); }; -struct skb_ext *__skb_ext_alloc(void); +struct skb_ext *__skb_ext_alloc(gfp_t flags); void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, struct skb_ext *ext); void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1bf0c3d278e7..35a133c6d13b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -6087,13 +6087,15 @@ static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) /** * __skb_ext_alloc - allocate a new skb extensions storage * + * @flags: See kmalloc(). + * * Returns the newly allocated pointer. The pointer can later attached to a * skb via __skb_ext_set(). * Note: caller must handle the skb_ext as an opaque data. 
*/ -struct skb_ext *__skb_ext_alloc(void) +struct skb_ext *__skb_ext_alloc(gfp_t flags) { - struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); + struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags); if (new) { memset(new->offset, 0, sizeof(new->offset)); @@ -6188,7 +6190,7 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) } else { newoff = SKB_EXT_CHUNKSIZEOF(*new); - new = __skb_ext_alloc(); + new = __skb_ext_alloc(GFP_ATOMIC); if (!new) return NULL; } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index bc950cf818f7..e3a628bea2b8 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -367,8 +367,10 @@ static void mptcp_stop_timer(struct sock *sk) static bool mptcp_ext_cache_refill(struct mptcp_sock *msk) { + const struct sock *sk = (const struct sock *)msk; + if (!msk->cached_ext) - msk->cached_ext = __skb_ext_alloc(); + msk->cached_ext = __skb_ext_alloc(sk->sk_allocation); return !!msk->cached_ext; } -- cgit v1.2.3 From dbfe7d74376e187f3c6eaff822e85176bc2cd06e Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Sat, 16 May 2020 18:23:36 -0700 Subject: rds: convert get_user_pages() --> pin_user_pages() This code was using get_user_pages_fast(), in a "Case 2" scenario (DMA/RDMA), using the categorization from [1]. That means that it's time to convert the get_user_pages_fast() + put_page() calls to pin_user_pages_fast() + unpin_user_pages() calls. There is some helpful background in [2]: basically, this is a small part of fixing a long-standing disconnect between pinning pages, and file systems' use of those pages. [1] Documentation/core-api/pin_user_pages.rst [2] "Explicit pinning of user-space pages": https://lwn.net/Articles/807108/ Cc: David S. Miller Cc: Jakub Kicinski Cc: netdev@vger.kernel.org Cc: linux-rdma@vger.kernel.org Cc: rds-devel@oss.oracle.com Signed-off-by: John Hubbard Signed-off-by: David S. Miller --- net/rds/info.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rds/info.c b/net/rds/info.c index 03f6fd56d237..e1d63563e81c 100644 --- a/net/rds/info.c +++ b/net/rds/info.c @@ -162,7 +162,6 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, struct rds_info_lengths lens; unsigned long nr_pages = 0; unsigned long start; - unsigned long i; rds_info_func func; struct page **pages = NULL; int ret; @@ -193,7 +192,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, ret = -ENOMEM; goto out; } - ret = get_user_pages_fast(start, nr_pages, FOLL_WRITE, pages); + ret = pin_user_pages_fast(start, nr_pages, FOLL_WRITE, pages); if (ret != nr_pages) { if (ret > 0) nr_pages = ret; @@ -235,8 +234,7 @@ call_func: ret = -EFAULT; out: - for (i = 0; pages && i < nr_pages; i++) - put_page(pages[i]); + unpin_user_pages(pages, nr_pages); kfree(pages); return ret; -- cgit v1.2.3 From 49c06c9eb14ba61725c1c82e5107f4e4bd6c1886 Mon Sep 17 00:00:00 2001 From: Łukasz Rymanowski Date: Wed, 13 May 2020 10:18:53 +0200 Subject: Bluetooth: Fix for GAP/SEC/SEM/BI-10-C MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Security Mode 1 Level 4 forces us to use a key size of 16 octets. This patch adds a check for that. 
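The intent of the added check can be shown as a small standalone C sketch. The key-size bounds match the SMP-defined minimum and maximum of 7 and 16 octets; the helper name and the plain boolean fips parameter are simplifications for illustration, not kernel API:

	#include <stdbool.h>

	#define SMP_MIN_ENC_KEY_SIZE	7
	#define SMP_MAX_ENC_KEY_SIZE	16

	/* Sketch: under Security Mode 1 Level 4 (FIPS), only the full
	 * 16-octet key is acceptable; otherwise the usual range applies.
	 */
	static bool smp_key_size_ok(bool fips, unsigned int max_key_size,
				    unsigned int le_max_key_size)
	{
		if (fips && max_key_size != SMP_MAX_ENC_KEY_SIZE)
			return false;
		return max_key_size >= SMP_MIN_ENC_KEY_SIZE &&
		       max_key_size <= le_max_key_size;
	}
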
This is required for the qualification test GAP/SEC/SEM/BI-10-C. Logs from the test when ATT is configured with sec level BT_SECURITY_FIPS: < ACL Data TX: Handle 3585 flags 0x00 dlen 11 #28 [hci0] 3.785965 SMP: Pairing Request (0x01) len 6 IO capability: DisplayYesNo (0x01) OOB data: Authentication data not present (0x00) Authentication requirement: Bonding, MITM, SC, No Keypresses (0x0d) Max encryption key size: 16 Initiator key distribution: EncKey Sign (0x05) Responder key distribution: EncKey IdKey Sign (0x07) > ACL Data RX: Handle 3585 flags 0x02 dlen 11 #35 [hci0] 3.883020 SMP: Pairing Response (0x02) len 6 IO capability: DisplayYesNo (0x01) OOB data: Authentication data not present (0x00) Authentication requirement: Bonding, MITM, SC, No Keypresses (0x0d) Max encryption key size: 7 Initiator key distribution: EncKey Sign (0x05) Responder key distribution: EncKey IdKey Sign (0x07) < ACL Data TX: Handle 3585 flags 0x00 dlen 6 #36 [hci0] 3.883136 SMP: Pairing Failed (0x05) len 1 Reason: Encryption key size (0x06) Signed-off-by: Łukasz Rymanowski Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 5510017cf9ff..6fd9ddb2d85c 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -730,6 +730,10 @@ static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size) struct hci_dev *hdev = conn->hcon->hdev; struct smp_chan *smp = chan->data; + if (conn->hcon->pending_sec_level == BT_SECURITY_FIPS && + max_key_size != SMP_MAX_ENC_KEY_SIZE) + return SMP_ENC_KEY_SIZE; + if (max_key_size > hdev->le_max_key_size || max_key_size < SMP_MIN_ENC_KEY_SIZE) return SMP_ENC_KEY_SIZE; -- cgit v1.2.3 From 56b5453a86203a44726f523b4133c1feca49ce7c Mon Sep 17 00:00:00 2001 From: Hsin-Yu Chao Date: Fri, 15 May 2020 17:27:04 +0800 Subject: Bluetooth: Add SCO fallback for invalid LMP parameters error Bluetooth PTS test case HFP/AG/ACC/BI-12-I accepts a SCO connection with an invalid parameter at the first SCO request, expecting the AG to attempt another SCO request with the use of "safe settings" for the given codec, based on section 5.7.1.2 of the HFP 1.7 specification. This patch addresses it by adding "Invalid LMP Parameters" (0x1e) to the SCO fallback case. 
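In outline, the fallback catches selected Synchronous Connect Complete error statuses on an outgoing connection and re-issues the request with the next, more conservative parameter set. A condensed sketch of the dispatch follows; the status values match the hunk below, while the function and the retry helper are hypothetical stand-ins for the existing fallback path:

	/* Hypothetical helper: re-issue Setup Synchronous Connection with
	 * the next "safe settings" parameter set for the negotiated codec.
	 */
	void retry_sco_with_safe_settings(void);

	static void handle_sync_conn_status(bool outgoing, unsigned char status)
	{
		switch (status) {
		case 0x11: /* Unsupported Feature or Parameter Value */
		case 0x1c: /* SCO interval rejected */
		case 0x1a: /* Unsupported Remote Feature */
		case 0x1e: /* Invalid LMP Parameters (the newly handled status) */
		case 0x1f: /* Unspecified error */
		case 0x20: /* Unsupported LMP Parameter value */
			if (outgoing)
				retry_sco_with_safe_settings();
			break;
		default:
			break;
		}
	}
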
Verified with the log below: < HCI Command: Setup Synchronous Connection (0x01|0x0028) plen 17 Handle: 256 Transmit bandwidth: 8000 Receive bandwidth: 8000 Max latency: 13 Setting: 0x0003 Input Coding: Linear Input Data Format: 1's complement Input Sample Size: 8-bit # of bits padding at MSB: 0 Air Coding Format: Transparent Data Retransmission effort: Optimize for link quality (0x02) Packet type: 0x0380 3-EV3 may not be used 2-EV5 may not be used 3-EV5 may not be used > HCI Event: Command Status (0x0f) plen 4 Setup Synchronous Connection (0x01|0x0028) ncmd 1 Status: Success (0x00) > HCI Event: Number of Completed Packets (0x13) plen 5 Num handles: 1 Handle: 256 Count: 1 > HCI Event: Max Slots Change (0x1b) plen 3 Handle: 256 Max slots: 1 > HCI Event: Synchronous Connect Complete (0x2c) plen 17 Status: Invalid LMP Parameters / Invalid LL Parameters (0x1e) Handle: 0 Address: 00:1B:DC:F2:21:59 (OUI 00-1B-DC) Link type: eSCO (0x02) Transmission interval: 0x00 Retransmission window: 0x02 RX packet length: 0 TX packet length: 0 Air mode: Transparent (0x03) < HCI Command: Setup Synchronous Connection (0x01|0x0028) plen 17 Handle: 256 Transmit bandwidth: 8000 Receive bandwidth: 8000 Max latency: 8 Setting: 0x0003 Input Coding: Linear Input Data Format: 1's complement Input Sample Size: 8-bit # of bits padding at MSB: 0 Air Coding Format: Transparent Data Retransmission effort: Optimize for link quality (0x02) Packet type: 0x03c8 EV3 may be used 2-EV3 may not be used 3-EV3 may not be used 2-EV5 may not be used 3-EV5 may not be used > HCI Event: Command Status (0x0f) plen 4 Setup Synchronous Connection (0x01|0x0028) ncmd 1 Status: Success (0x00) > HCI Event: Max Slots Change (0x1b) plen 3 Handle: 256 Max slots: 5 > HCI Event: Max Slots Change (0x1b) plen 3 Handle: 256 Max slots: 1 > HCI Event: Synchronous Connect Complete (0x2c) plen 17 Status: Success (0x00) Handle: 257 Address: 00:1B:DC:F2:21:59 (OUI 00-1B-DC) Link type: eSCO (0x02) Transmission interval: 0x06 Retransmission window: 0x04 RX packet length: 30 TX packet length: 30 Air mode: Transparent (0x03) Signed-off-by: Hsin-Yu Chao Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 73aabca0064b..f024b3d57a1c 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4337,6 +4337,7 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, case 0x11: /* Unsupported Feature or Parameter Value */ case 0x1c: /* SCO interval rejected */ case 0x1a: /* Unsupported Remote Feature */ + case 0x1e: /* Invalid LMP Parameters */ case 0x1f: /* Unspecified error */ case 0x20: /* Unsupported LMP Parameter value */ if (conn->out) { -- cgit v1.2.3 From 7c1552da900c159a13473a2106c7547746ebe4a1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 May 2020 08:28:05 +0200 Subject: ipv6: lift copy_from_user out of ipv6_route_ioctl Prepare for better compat ioctl handling by moving the user copy out of ipv6_route_ioctl. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- include/net/ip6_route.h | 3 ++- net/ipv6/af_inet6.c | 16 ++++++++++------ net/ipv6/route.c | 44 ++++++++++++++++---------------------------- 3 files changed, 28 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index e525f003e619..2a5277758379 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -118,7 +118,8 @@ void ip6_route_init_special_entries(void); int ip6_route_init(void); void ip6_route_cleanup(void); -int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); +int ipv6_route_ioctl(struct net *net, unsigned int cmd, + struct in6_rtmsg *rtmsg); int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 771a462a8322..a618beb9b6d5 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -542,21 +542,25 @@ EXPORT_SYMBOL(inet6_getname); int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { + void __user *argp = (void __user *)arg; struct sock *sk = sock->sk; struct net *net = sock_net(sk); switch (cmd) { case SIOCADDRT: - case SIOCDELRT: - - return ipv6_route_ioctl(net, cmd, (void __user *)arg); + case SIOCDELRT: { + struct in6_rtmsg rtmsg; + if (copy_from_user(&rtmsg, argp, sizeof(rtmsg))) + return -EFAULT; + return ipv6_route_ioctl(net, cmd, &rtmsg); + } case SIOCSIFADDR: - return addrconf_add_ifaddr(net, (void __user *) arg); + return addrconf_add_ifaddr(net, argp); case SIOCDIFADDR: - return addrconf_del_ifaddr(net, (void __user *) arg); + return addrconf_del_ifaddr(net, argp); case SIOCSIFDSTADDR: - return addrconf_set_dstaddr(net, (void __user *) arg); + return addrconf_set_dstaddr(net, argp); default: if (!sk->sk_prot->ioctl) return -ENOIOCTLCMD; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a8b4add0b545..a52ec1b86432 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4336,41 +4336,29 @@ static void rtmsg_to_fib6_config(struct net *net, }; } -int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) +int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg) { struct fib6_config cfg; - struct in6_rtmsg rtmsg; int err; - switch (cmd) { - case SIOCADDRT: /* Add a route */ - case SIOCDELRT: /* Delete a route */ - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - return -EPERM; - err = copy_from_user(&rtmsg, arg, - sizeof(struct in6_rtmsg)); - if (err) - return -EFAULT; + if (cmd != SIOCADDRT && cmd != SIOCDELRT) + return -EINVAL; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; - rtmsg_to_fib6_config(net, &rtmsg, &cfg); + rtmsg_to_fib6_config(net, rtmsg, &cfg); - rtnl_lock(); - switch (cmd) { - case SIOCADDRT: - err = ip6_route_add(&cfg, GFP_KERNEL, NULL); - break; - case SIOCDELRT: - err = ip6_route_del(&cfg, NULL); - break; - default: - err = -EINVAL; - } - rtnl_unlock(); - - return err; + rtnl_lock(); + switch (cmd) { + case SIOCADDRT: + err = ip6_route_add(&cfg, GFP_KERNEL, NULL); + break; + case SIOCDELRT: + err = ip6_route_del(&cfg, NULL); + break; } - - return -EINVAL; + rtnl_unlock(); + return err; } /* -- cgit v1.2.3 From 3986912f6a9aae50945fc9d3513c621381eba1aa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 May 2020 08:28:06 +0200 Subject: ipv6: move SIOCADDRT and SIOCDELRT handling into ->compat_ioctl To prepare for removing the global routing_ioctl hack, start lifting the code into a newly added ipv6 ->compat_ioctl handler. 
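For background on why these two commands need a compat path at all: the native route message carries an unsigned long field, so its size and the offsets of everything after it differ between 32-bit userland and a 64-bit kernel. The layout below is a sketch of include/uapi/linux/ipv6_route.h, quoted from memory for illustration only:

	#include <linux/in6.h>		/* struct in6_addr */
	#include <linux/types.h>	/* __u16, __u32 */

	/* Sketch of the native layout; on 64-bit kernels rtmsg_info is
	 * 8 bytes, for 32-bit userland it is 4, which is why the patch
	 * introduces a fixed-width compat_in6_rtmsg and copies it field
	 * by field.
	 */
	struct in6_rtmsg_sketch {
		struct in6_addr	rtmsg_dst;
		struct in6_addr	rtmsg_src;
		struct in6_addr	rtmsg_gateway;
		__u32		rtmsg_type;
		__u16		rtmsg_dst_len;
		__u16		rtmsg_src_len;
		__u32		rtmsg_metric;
		unsigned long	rtmsg_info;	/* ABI-dependent size */
		__u32		rtmsg_flags;
		int		rtmsg_ifindex;
	};
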
Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/net/ipv6.h | 2 ++ net/dccp/ipv6.c | 1 + net/ipv6/af_inet6.c | 53 +++++++++++++++++++++++++++++++++++++++++++ net/ipv6/raw.c | 1 + net/l2tp/l2tp_ip6.c | 1 + net/mptcp/protocol.c | 1 + net/sctp/ipv6.c | 1 + net/socket.c | 63 ++++++++++++++-------------------------------------- 8 files changed, 77 insertions(+), 46 deletions(-) (limited to 'net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 955badd1e8ff..5fc3a9d7b053 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1115,6 +1115,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int peer); int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); +int inet6_compat_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg); int inet6_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk); diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 1e5e08cc0bfc..650187d68851 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1082,6 +1082,7 @@ static const struct proto_ops inet6_dccp_ops = { .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT + .compat_ioctl = inet6_compat_ioctl, .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index a618beb9b6d5..b69496eaf922 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include @@ -571,6 +572,56 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } EXPORT_SYMBOL(inet6_ioctl); +#ifdef CONFIG_COMPAT +struct compat_in6_rtmsg { + struct in6_addr rtmsg_dst; + struct in6_addr rtmsg_src; + struct in6_addr rtmsg_gateway; + u32 rtmsg_type; + u16 rtmsg_dst_len; + u16 rtmsg_src_len; + u32 rtmsg_metric; + u32 rtmsg_info; + u32 rtmsg_flags; + s32 rtmsg_ifindex; +}; + +static int inet6_compat_routing_ioctl(struct sock *sk, unsigned int cmd, + struct compat_in6_rtmsg __user *ur) +{ + struct in6_rtmsg rt; + + if (copy_from_user(&rt.rtmsg_dst, &ur->rtmsg_dst, + 3 * sizeof(struct in6_addr)) || + get_user(rt.rtmsg_type, &ur->rtmsg_type) || + get_user(rt.rtmsg_dst_len, &ur->rtmsg_dst_len) || + get_user(rt.rtmsg_src_len, &ur->rtmsg_src_len) || + get_user(rt.rtmsg_metric, &ur->rtmsg_metric) || + get_user(rt.rtmsg_info, &ur->rtmsg_info) || + get_user(rt.rtmsg_flags, &ur->rtmsg_flags) || + get_user(rt.rtmsg_ifindex, &ur->rtmsg_ifindex)) + return -EFAULT; + + + return ipv6_route_ioctl(sock_net(sk), cmd, &rt); +} + +int inet6_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + void __user *argp = compat_ptr(arg); + struct sock *sk = sock->sk; + + switch (cmd) { + case SIOCADDRT: + case SIOCDELRT: + return inet6_compat_routing_ioctl(sk, cmd, argp); + default: + return -ENOIOCTLCMD; + } +} +EXPORT_SYMBOL_GPL(inet6_compat_ioctl); +#endif /* CONFIG_COMPAT */ + INDIRECT_CALLABLE_DECLARE(int udpv6_sendmsg(struct sock *, struct msghdr *, size_t)); int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) @@ -632,6 +683,7 @@ const struct proto_ops inet6_stream_ops = { .read_sock = tcp_read_sock, .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT + .compat_ioctl = inet6_compat_ioctl, .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif @@ -660,6 +712,7 @@ const struct proto_ops 
inet6_dgram_ops = { .sendpage = sock_no_sendpage, .set_peek_off = sk_set_peek_off, #ifdef CONFIG_COMPAT + .compat_ioctl = inet6_compat_ioctl, .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 0028aa1d7869..8ef5a7b30524 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1377,6 +1377,7 @@ const struct proto_ops inet6_sockraw_ops = { .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT + .compat_ioctl = inet6_compat_ioctl, .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index d148766f40d1..fdfef926c591 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -758,6 +758,7 @@ static const struct proto_ops l2tp_ip6_ops = { .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT + .compat_ioctl = inet6_compat_ioctl, .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e3a628bea2b8..ba9d3d5c625f 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2068,6 +2068,7 @@ static const struct proto_ops mptcp_v6_stream_ops = { .mmap = sock_no_mmap, .sendpage = inet_sendpage, #ifdef CONFIG_COMPAT + .compat_ioctl = inet6_compat_ioctl, .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index c87af430107a..ccfa0ab3e7f4 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -1032,6 +1032,7 @@ static const struct proto_ops inet6_seqpacket_ops = { .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, #ifdef CONFIG_COMPAT + .compat_ioctl = inet6_compat_ioctl, .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif diff --git a/net/socket.c b/net/socket.c index 1c9a7260a41d..682447075775 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3384,62 +3384,33 @@ struct rtentry32 { unsigned short rt_irtt; /* Initial RTT */ }; -struct in6_rtmsg32 { - struct in6_addr rtmsg_dst; - struct in6_addr rtmsg_src; - struct in6_addr rtmsg_gateway; - u32 rtmsg_type; - u16 rtmsg_dst_len; - u16 rtmsg_src_len; - u32 rtmsg_metric; - u32 rtmsg_info; - u32 rtmsg_flags; - s32 rtmsg_ifindex; -}; - static int routing_ioctl(struct net *net, struct socket *sock, unsigned int cmd, void __user *argp) { + struct rtentry32 __user *ur4 = argp; int ret; void *r = NULL; - struct in6_rtmsg r6; struct rtentry r4; char devname[16]; u32 rtdev; mm_segment_t old_fs = get_fs(); - if (sock && sock->sk && sock->sk->sk_family == AF_INET6) { /* ipv6 */ - struct in6_rtmsg32 __user *ur6 = argp; - ret = copy_from_user(&r6.rtmsg_dst, &(ur6->rtmsg_dst), - 3 * sizeof(struct in6_addr)); - ret |= get_user(r6.rtmsg_type, &(ur6->rtmsg_type)); - ret |= get_user(r6.rtmsg_dst_len, &(ur6->rtmsg_dst_len)); - ret |= get_user(r6.rtmsg_src_len, &(ur6->rtmsg_src_len)); - ret |= get_user(r6.rtmsg_metric, &(ur6->rtmsg_metric)); - ret |= get_user(r6.rtmsg_info, &(ur6->rtmsg_info)); - ret |= get_user(r6.rtmsg_flags, &(ur6->rtmsg_flags)); - ret |= get_user(r6.rtmsg_ifindex, &(ur6->rtmsg_ifindex)); - - r = (void *) &r6; - } else { /* ipv4 */ - struct rtentry32 __user *ur4 = argp; - ret = copy_from_user(&r4.rt_dst, &(ur4->rt_dst), - 3 * sizeof(struct sockaddr)); - ret |= get_user(r4.rt_flags, &(ur4->rt_flags)); - ret |= 
get_user(r4.rt_metric, &(ur4->rt_metric)); - ret |= get_user(r4.rt_mtu, &(ur4->rt_mtu)); - ret |= get_user(r4.rt_window, &(ur4->rt_window)); - ret |= get_user(r4.rt_irtt, &(ur4->rt_irtt)); - ret |= get_user(rtdev, &(ur4->rt_dev)); - if (rtdev) { - ret |= copy_from_user(devname, compat_ptr(rtdev), 15); - r4.rt_dev = (char __user __force *)devname; - devname[15] = 0; - } else - r4.rt_dev = NULL; - - r = (void *) &r4; - } + ret = copy_from_user(&r4.rt_dst, &(ur4->rt_dst), + 3 * sizeof(struct sockaddr)); + ret |= get_user(r4.rt_flags, &(ur4->rt_flags)); + ret |= get_user(r4.rt_metric, &(ur4->rt_metric)); + ret |= get_user(r4.rt_mtu, &(ur4->rt_mtu)); + ret |= get_user(r4.rt_window, &(ur4->rt_window)); + ret |= get_user(r4.rt_irtt, &(ur4->rt_irtt)); + ret |= get_user(rtdev, &(ur4->rt_dev)); + if (rtdev) { + ret |= copy_from_user(devname, compat_ptr(rtdev), 15); + r4.rt_dev = (char __user __force *)devname; + devname[15] = 0; + } else + r4.rt_dev = NULL; + + r = (void *) &r4; if (ret) { ret = -EFAULT; -- cgit v1.2.3 From a50049235483b0337d129e2878e99750e4da3ac2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 May 2020 08:28:07 +0200 Subject: appletalk: factor out an atrtr_ioctl_addrt helper Add a helper that can be shared with the upcoming compat ioctl handler. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- net/appletalk/ddp.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index b41375d4d295..4177a74f6543 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -867,6 +867,24 @@ static int atif_ioctl(int cmd, void __user *arg) return copy_to_user(arg, &atreq, sizeof(atreq)) ? -EFAULT : 0; } +static int atrtr_ioctl_addrt(struct rtentry *rt) +{ + struct net_device *dev = NULL; + + if (rt->rt_dev) { + char name[IFNAMSIZ]; + + if (copy_from_user(name, rt->rt_dev, IFNAMSIZ-1)) + return -EFAULT; + name[IFNAMSIZ-1] = '\0'; + + dev = __dev_get_by_name(&init_net, name); + if (!dev) + return -ENODEV; + } + return atrtr_create(rt, dev); +} + /* Routing ioctl() calls */ static int atrtr_ioctl(unsigned int cmd, void __user *arg) { @@ -882,19 +900,8 @@ static int atrtr_ioctl(unsigned int cmd, void __user *arg) return atrtr_delete(&((struct sockaddr_at *) &rt.rt_dst)->sat_addr); - case SIOCADDRT: { - struct net_device *dev = NULL; - if (rt.rt_dev) { - char name[IFNAMSIZ]; - if (copy_from_user(name, rt.rt_dev, IFNAMSIZ-1)) - return -EFAULT; - name[IFNAMSIZ-1] = '\0'; - dev = __dev_get_by_name(&init_net, name); - if (!dev) - return -ENODEV; - } - return atrtr_create(&rt, dev); - } + case SIOCADDRT: + return atrtr_ioctl_addrt(&rt); } return -EINVAL; } -- cgit v1.2.3 From dc13c8761c91c06acd3d98cd107f371cba9811b9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 May 2020 08:28:08 +0200 Subject: ipv4,appletalk: move SIOCADDRT and SIOCDELRT handling into ->compat_ioctl To prepare for removing the global routing_ioctl hack, start lifting the code into the ipv4 and appletalk ->compat_ioctl handlers. Unlike the existing handler, we don't bother copying in the name - there are no compat issues for char arrays. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- include/net/compat.h | 18 +++++++++++++++ net/appletalk/ddp.c | 49 ++++++++++++++++++++++++++++++++++++----- net/ipv4/af_inet.c | 38 +++++++++++++++++++++++++++----- net/socket.c | 62 ---------------------------------------------------- 4 files changed, 94 insertions(+), 73 deletions(-) (limited to 'net') diff --git a/include/net/compat.h b/include/net/compat.h index e341260642fe..2b5e1f7ba153 100644 --- a/include/net/compat.h +++ b/include/net/compat.h @@ -30,6 +30,24 @@ struct compat_cmsghdr { compat_int_t cmsg_type; }; +struct compat_rtentry { + u32 rt_pad1; + struct sockaddr rt_dst; /* target address */ + struct sockaddr rt_gateway; /* gateway addr (RTF_GATEWAY) */ + struct sockaddr rt_genmask; /* target network mask (IP) */ + unsigned short rt_flags; + short rt_pad2; + u32 rt_pad3; + unsigned char rt_tos; + unsigned char rt_class; + short rt_pad4; + short rt_metric; /* +1 for binary compatibility! */ + compat_uptr_t rt_dev; /* forcing the device at add */ + u32 rt_mtu; /* per route MTU/Window */ + u32 rt_window; /* Window clamping */ + unsigned short rt_irtt; /* Initial RTT */ +}; + #else /* defined(CONFIG_COMPAT) */ /* * To avoid compiler warnings: diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 4177a74f6543..15787e8c0629 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include @@ -1839,20 +1840,58 @@ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) #ifdef CONFIG_COMPAT +static int atalk_compat_routing_ioctl(struct sock *sk, unsigned int cmd, + struct compat_rtentry __user *ur) +{ + compat_uptr_t rtdev; + struct rtentry rt; + + if (copy_from_user(&rt.rt_dst, &ur->rt_dst, + 3 * sizeof(struct sockaddr)) || + get_user(rt.rt_flags, &ur->rt_flags) || + get_user(rt.rt_metric, &ur->rt_metric) || + get_user(rt.rt_mtu, &ur->rt_mtu) || + get_user(rt.rt_window, &ur->rt_window) || + get_user(rt.rt_irtt, &ur->rt_irtt) || + get_user(rtdev, &ur->rt_dev)) + return -EFAULT; + + switch (cmd) { + case SIOCDELRT: + if (rt.rt_dst.sa_family != AF_APPLETALK) + return -EINVAL; + return atrtr_delete(&((struct sockaddr_at *) + &rt.rt_dst)->sat_addr); + + case SIOCADDRT: + rt.rt_dev = compat_ptr(rtdev); + return atrtr_ioctl_addrt(&rt); + default: + return -EINVAL; + } +} static int atalk_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { + void __user *argp = compat_ptr(arg); + struct sock *sk = sock->sk; + + switch (cmd) { + case SIOCADDRT: + case SIOCDELRT: + return atalk_compat_routing_ioctl(sk, cmd, argp); /* * SIOCATALKDIFADDR is a SIOCPROTOPRIVATE ioctl number, so we * cannot handle it in common code. The data we access if ifreq * here is compatible, so we can simply call the native * handler. 
*/ - if (cmd == SIOCATALKDIFADDR) - return atalk_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); - - return -ENOIOCTLCMD; + case SIOCATALKDIFADDR: + return atalk_ioctl(sock, cmd, (unsigned long)argp); + default: + return -ENOIOCTLCMD; + } } -#endif +#endif /* CONFIG_COMPAT */ static const struct net_proto_family atalk_family_ops = { diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index fcf0d12a407a..c35a8b2e0499 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -116,6 +116,7 @@ #include #endif #include +#include #include @@ -970,17 +971,42 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) EXPORT_SYMBOL(inet_ioctl); #ifdef CONFIG_COMPAT +static int inet_compat_routing_ioctl(struct sock *sk, unsigned int cmd, + struct compat_rtentry __user *ur) +{ + compat_uptr_t rtdev; + struct rtentry rt; + + if (copy_from_user(&rt.rt_dst, &ur->rt_dst, + 3 * sizeof(struct sockaddr)) || + get_user(rt.rt_flags, &ur->rt_flags) || + get_user(rt.rt_metric, &ur->rt_metric) || + get_user(rt.rt_mtu, &ur->rt_mtu) || + get_user(rt.rt_window, &ur->rt_window) || + get_user(rt.rt_irtt, &ur->rt_irtt) || + get_user(rtdev, &ur->rt_dev)) + return -EFAULT; + + rt.rt_dev = compat_ptr(rtdev); + return ip_rt_ioctl(sock_net(sk), cmd, &rt); +} + static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { + void __user *argp = compat_ptr(arg); struct sock *sk = sock->sk; - int err = -ENOIOCTLCMD; - - if (sk->sk_prot->compat_ioctl) - err = sk->sk_prot->compat_ioctl(sk, cmd, arg); - return err; + switch (cmd) { + case SIOCADDRT: + case SIOCDELRT: + return inet_compat_routing_ioctl(sk, cmd, argp); + default: + if (!sk->sk_prot->compat_ioctl) + return -ENOIOCTLCMD; + return sk->sk_prot->compat_ioctl(sk, cmd, arg); + } } -#endif +#endif /* CONFIG_COMPAT */ const struct proto_ops inet_stream_ops = { .family = PF_INET, diff --git a/net/socket.c b/net/socket.c index 682447075775..80422fc3c836 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3366,65 +3366,6 @@ static int compat_sioc_ifmap(struct net *net, unsigned int cmd, return err; } -struct rtentry32 { - u32 rt_pad1; - struct sockaddr rt_dst; /* target address */ - struct sockaddr rt_gateway; /* gateway addr (RTF_GATEWAY) */ - struct sockaddr rt_genmask; /* target network mask (IP) */ - unsigned short rt_flags; - short rt_pad2; - u32 rt_pad3; - unsigned char rt_tos; - unsigned char rt_class; - short rt_pad4; - short rt_metric; /* +1 for binary compatibility! 
*/ - /* char * */ u32 rt_dev; /* forcing the device at add */ - u32 rt_mtu; /* per route MTU/Window */ - u32 rt_window; /* Window clamping */ - unsigned short rt_irtt; /* Initial RTT */ -}; - -static int routing_ioctl(struct net *net, struct socket *sock, - unsigned int cmd, void __user *argp) -{ - struct rtentry32 __user *ur4 = argp; - int ret; - void *r = NULL; - struct rtentry r4; - char devname[16]; - u32 rtdev; - mm_segment_t old_fs = get_fs(); - - ret = copy_from_user(&r4.rt_dst, &(ur4->rt_dst), - 3 * sizeof(struct sockaddr)); - ret |= get_user(r4.rt_flags, &(ur4->rt_flags)); - ret |= get_user(r4.rt_metric, &(ur4->rt_metric)); - ret |= get_user(r4.rt_mtu, &(ur4->rt_mtu)); - ret |= get_user(r4.rt_window, &(ur4->rt_window)); - ret |= get_user(r4.rt_irtt, &(ur4->rt_irtt)); - ret |= get_user(rtdev, &(ur4->rt_dev)); - if (rtdev) { - ret |= copy_from_user(devname, compat_ptr(rtdev), 15); - r4.rt_dev = (char __user __force *)devname; - devname[15] = 0; - } else - r4.rt_dev = NULL; - - r = (void *) &r4; - - if (ret) { - ret = -EFAULT; - goto out; - } - - set_fs(KERNEL_DS); - ret = sock_do_ioctl(net, sock, cmd, (unsigned long) r); - set_fs(old_fs); - -out: - return ret; -} - /* Since old style bridge ioctl's endup using SIOCDEVPRIVATE * for some operations; this forces use of the newer bridge-utils that * use compatible ioctls @@ -3463,9 +3404,6 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCGIFMAP: case SIOCSIFMAP: return compat_sioc_ifmap(net, cmd, argp); - case SIOCADDRT: - case SIOCDELRT: - return routing_ioctl(net, sock, cmd, argp); case SIOCGSTAMP_OLD: case SIOCGSTAMPNS_OLD: if (!sock->ops->gettstamp) -- cgit v1.2.3 From d800bad67d4c21aaf11722f04e0f7547fb915ab5 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 18 May 2020 15:05:27 +0200 Subject: bpf: Fix too large copy from user in bpf_test_init Commit bc56c919fce7 ("bpf: Add xdp.frame_sz in bpf_prog_test_run_xdp().") recently changed bpf_prog_test_run_xdp() to use larger frames for XDP in order to test tail growing frames (via bpf_xdp_adjust_tail) and to have the memory backing the frame better resemble drivers. The commit contains a bug, as it tries to copy the max data size from userspace, instead of the size provided by userspace. This causes XDP unit tests to fail sporadically with EFAULT, an unfortunate behavior. The fix is to only copy the size specified by userspace. 
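In isolation, the corrected pattern looks like this: the buffer stays sized for the maximum frame, oversized input is rejected up front, and the copy uses the length userspace actually supplied. A minimal sketch using the names from the hunk below (the wrapper function itself is illustrative):

	/* Assumes linux/slab.h, linux/uaccess.h and linux/err.h. */
	static void *test_init_sketch(void __user *data_in, u32 size,
				      u32 user_size, u32 headroom, u32 tailroom)
	{
		void *data;

		if (user_size > size)
			return ERR_PTR(-EMSGSIZE);	/* reject oversized input */

		data = kzalloc(size + headroom + tailroom, GFP_USER);
		if (!data)
			return ERR_PTR(-ENOMEM);

		/* Copying 'size' bytes here was the bug: it can read past the
		 * end of the user buffer and fault. Copy 'user_size' instead.
		 */
		if (copy_from_user(data + headroom, data_in, user_size)) {
			kfree(data);
			return ERR_PTR(-EFAULT);
		}
		return data;
	}
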
Fixes: bc56c919fce7 ("bpf: Add xdp.frame_sz in bpf_prog_test_run_xdp().") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/158980712729.256597.6115007718472928659.stgit@firesoul --- net/bpf/test_run.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 30ba7d38941d..bfd4ccd80847 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -160,16 +160,20 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 size, u32 headroom, u32 tailroom) { void __user *data_in = u64_to_user_ptr(kattr->test.data_in); + u32 user_size = kattr->test.data_size_in; void *data; if (size < ETH_HLEN || size > PAGE_SIZE - headroom - tailroom) return ERR_PTR(-EINVAL); + if (user_size > size) + return ERR_PTR(-EMSGSIZE); + data = kzalloc(size + headroom + tailroom, GFP_USER); if (!data) return ERR_PTR(-ENOMEM); - if (copy_from_user(data + headroom, data_in, size)) { + if (copy_from_user(data + headroom, data_in, user_size)) { kfree(data); return ERR_PTR(-EFAULT); } @@ -486,8 +490,6 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, /* XDP have extra tailroom as (most) drivers use full page */ max_data_sz = 4096 - headroom - tailroom; - if (size > max_data_sz) - return -EINVAL; data = bpf_test_init(kattr, max_data_sz, headroom, tailroom); if (IS_ERR(data)) -- cgit v1.2.3 From 1b66d253610c7f8f257103808a9460223a087469 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 19 May 2020 00:45:45 +0200 Subject: bpf: Add get{peer,sock}name attach types for sock_addr As stated in 983695fa6765 ("bpf: fix unconnected udp hooks"), the objective for the existing cgroup connect/sendmsg/recvmsg/bind BPF hooks is to be transparent to applications. In Cilium we make use of these hooks [0] in order to enable E-W load balancing for existing Kubernetes service types for all Cilium managed nodes in the cluster. Those backends can be local or remote. The main advantage of this approach is that it operates as close as possible to the socket, and therefore makes it possible to avoid packet-based NAT, given that in the connect/sendmsg/recvmsg hooks we only need to xlate sock addresses. This also allows exposing NodePort services on loopback addresses in the host namespace, for example. As another advantage, this also efficiently blocks bind requests for applications in the host namespace for exposed ports. However, one missing item is that we also need to perform reverse xlation for inet{,6}_getname() hooks such that we can return the service IP/port tuple back to the application instead of the remote peer address. The vast majority of applications does not bother about getpeername(), but on a few occasions we've seen breakage when validating the peer's address since it unexpectedly returns the backend tuple instead of the service one. Therefore, to address this situation, this trivial patch adds a getpeername() as well as a getsockname() BPF cgroup hook for both IPv4 and IPv6, which allows the returned addresses to be customised. Simple example: # ./cilium/cilium service list ID Frontend Service Type Backend 1 1.2.3.4:80 ClusterIP 1 => 10.0.0.10:80 Before: curl's verbose output example, no getpeername() reverse xlation: # curl --verbose 1.2.3.4 * Rebuilt URL to: 1.2.3.4/ * Trying 1.2.3.4... * TCP_NODELAY set * Connected to 1.2.3.4 (10.0.0.10) port 80 (#0) > GET / HTTP/1.1 > Host: 1.2.3.4 > User-Agent: curl/7.58.0 > Accept: */* [...] 
After: with getpeername() reverse xlation: # curl --verbose 1.2.3.4 * Rebuilt URL to: 1.2.3.4/ * Trying 1.2.3.4... * TCP_NODELAY set * Connected to 1.2.3.4 (1.2.3.4) port 80 (#0) > GET / HTTP/1.1 > Host: 1.2.3.4 > User-Agent: curl/7.58.0 > Accept: */* [...] Originally, I had both under a BPF_CGROUP_INET{4,6}_GETNAME type and exposed peer to the context in a fashion similar to inet{,6}_getname(), but API-wise this is suboptimal as it always forces programs to test for ctx->peer, which can easily be missed, hence the BPF_CGROUP_INET{4,6}_GET{PEER,SOCK}NAME split. Similarly, the checked return code is on tnum_range(1, 1), but if a use case comes up in the future, it can easily be changed to return an error code instead. Helper and ctx member access is the same as with connect/sendmsg/etc hooks. [0] https://github.com/cilium/cilium/blob/master/bpf/bpf_sock.c Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Andrey Ignatov Link: https://lore.kernel.org/bpf/61a479d759b2482ae3efb45546490bacd796a220.1589841594.git.daniel@iogearbox.net --- include/linux/bpf-cgroup.h | 1 + include/uapi/linux/bpf.h | 4 ++++ kernel/bpf/syscall.c | 12 ++++++++++++ kernel/bpf/verifier.c | 6 +++++- net/core/filter.c | 4 ++++ net/ipv4/af_inet.c | 8 ++++++-- net/ipv6/af_inet6.c | 9 ++++++--- tools/include/uapi/linux/bpf.h | 4 ++++ 8 files changed, 42 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 272626cc3fc9..c66c545e161a 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -396,6 +396,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, } #define cgroup_bpf_enabled (0) +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b9b8a0f63b91..97e1fd19ff58 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -220,6 +220,10 @@ enum bpf_attach_type { BPF_MODIFY_RETURN, BPF_LSM_MAC, BPF_TRACE_ITER, + BPF_CGROUP_INET4_GETPEERNAME, + BPF_CGROUP_INET6_GETPEERNAME, + BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 57dfc98289d5..431241c74614 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1978,6 +1978,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: @@ -2767,6 +2771,10 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: @@ -2912,6 +2920,10 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET6_POST_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case 
BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9c7d67d65d8c..2ed8351f47a4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7094,7 +7094,11 @@ static int check_return_code(struct bpf_verifier_env *env) switch (env->prog->type) { case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || - env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) + env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG || + env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME) range = tnum_range(1, 1); break; case BPF_PROG_TYPE_CGROUP_SKB: diff --git a/net/core/filter.c b/net/core/filter.c index 822d662f97ef..bd2853d23b50 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7049,6 +7049,8 @@ static bool sock_addr_is_valid_access(int off, int size, switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: break; @@ -7060,6 +7062,8 @@ static bool sock_addr_is_valid_access(int off, int size, switch (prog->expected_attach_type) { case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP6_RECVMSG: break; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index fcf0d12a407a..8f5c8c9409d3 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -755,12 +755,11 @@ do_err: } EXPORT_SYMBOL(inet_accept); - /* * This does both peername and sockname. */ int inet_getname(struct socket *sock, struct sockaddr *uaddr, - int peer) + int peer) { struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); @@ -781,6 +780,11 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; } + if (cgroup_bpf_enabled) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + peer ? BPF_CGROUP_INET4_GETPEERNAME : + BPF_CGROUP_INET4_GETSOCKNAME, + NULL); memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return sizeof(*sin); } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 771a462a8322..3b6fcc0c321a 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -504,9 +504,8 @@ EXPORT_SYMBOL_GPL(inet6_destroy_sock); /* * This does both peername and sockname. */ - int inet6_getname(struct socket *sock, struct sockaddr *uaddr, - int peer) + int peer) { struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr; struct sock *sk = sock->sk; @@ -531,9 +530,13 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin6_addr = np->saddr; else sin->sin6_addr = sk->sk_v6_rcv_saddr; - sin->sin6_port = inet->inet_sport; } + if (cgroup_bpf_enabled) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + peer ? 
BPF_CGROUP_INET6_GETPEERNAME : + BPF_CGROUP_INET6_GETSOCKNAME, + NULL); sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, sk->sk_bound_dev_if); return sizeof(*sin); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 146c742f1d49..1cddc398404a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -220,6 +220,10 @@ enum bpf_attach_type { BPF_MODIFY_RETURN, BPF_LSM_MAC, BPF_TRACE_ITER, + BPF_CGROUP_INET4_GETPEERNAME, + BPF_CGROUP_INET6_GETPEERNAME, + BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From 4b32f86bf1673acb16441dd55d7b325609f54897 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 19 May 2020 21:10:08 +0200 Subject: net/iucv: remove pm support commit 394216275c7d ("s390: remove broken hibernate / power management support") removed support for ARCH_HIBERNATION_POSSIBLE from s390. So drop the unused pm ops from the s390-only iucv bus driver. CC: Hendrik Brueckner Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- net/iucv/iucv.c | 188 -------------------------------------------------------- 1 file changed, 188 deletions(-) (limited to 'net') diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 9a2d023842fe..19250a0c85d3 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -67,32 +67,9 @@ static int iucv_bus_match(struct device *dev, struct device_driver *drv) return 0; } -enum iucv_pm_states { - IUCV_PM_INITIAL = 0, - IUCV_PM_FREEZING = 1, - IUCV_PM_THAWING = 2, - IUCV_PM_RESTORING = 3, -}; -static enum iucv_pm_states iucv_pm_state; - -static int iucv_pm_prepare(struct device *); -static void iucv_pm_complete(struct device *); -static int iucv_pm_freeze(struct device *); -static int iucv_pm_thaw(struct device *); -static int iucv_pm_restore(struct device *); - -static const struct dev_pm_ops iucv_pm_ops = { - .prepare = iucv_pm_prepare, - .complete = iucv_pm_complete, - .freeze = iucv_pm_freeze, - .thaw = iucv_pm_thaw, - .restore = iucv_pm_restore, -}; - struct bus_type iucv_bus = { .name = "iucv", .match = iucv_bus_match, - .pm = &iucv_pm_ops, }; EXPORT_SYMBOL(iucv_bus); @@ -434,31 +411,6 @@ static void iucv_block_cpu(void *data) cpumask_clear_cpu(cpu, &iucv_irq_cpumask); } -/** - * iucv_block_cpu_almost - * @data: unused - * - * Allow connection-severed interrupts only on this cpu. - */ -static void iucv_block_cpu_almost(void *data) -{ - int cpu = smp_processor_id(); - union iucv_param *parm; - - /* Allow iucv control interrupts only */ - parm = iucv_param_irq[cpu]; - memset(parm, 0, sizeof(union iucv_param)); - parm->set_mask.ipmask = 0x08; - iucv_call_b2f0(IUCV_SETMASK, parm); - /* Allow iucv-severed interrupt only */ - memset(parm, 0, sizeof(union iucv_param)); - parm->set_mask.ipmask = 0x20; - iucv_call_b2f0(IUCV_SETCONTROLMASK, parm); - - /* Clear indication that iucv interrupts are allowed for this cpu. 
*/ - cpumask_clear_cpu(cpu, &iucv_irq_cpumask); -} - /** * iucv_declare_cpu * @data: unused @@ -1834,146 +1786,6 @@ static void iucv_external_interrupt(struct ext_code ext_code, spin_unlock(&iucv_queue_lock); } -static int iucv_pm_prepare(struct device *dev) -{ - int rc = 0; - -#ifdef CONFIG_PM_DEBUG - printk(KERN_INFO "iucv_pm_prepare\n"); -#endif - if (dev->driver && dev->driver->pm && dev->driver->pm->prepare) - rc = dev->driver->pm->prepare(dev); - return rc; -} - -static void iucv_pm_complete(struct device *dev) -{ -#ifdef CONFIG_PM_DEBUG - printk(KERN_INFO "iucv_pm_complete\n"); -#endif - if (dev->driver && dev->driver->pm && dev->driver->pm->complete) - dev->driver->pm->complete(dev); -} - -/** - * iucv_path_table_empty() - determine if iucv path table is empty - * - * Returns 0 if there are still iucv pathes defined - * 1 if there are no iucv pathes defined - */ -static int iucv_path_table_empty(void) -{ - int i; - - for (i = 0; i < iucv_max_pathid; i++) { - if (iucv_path_table[i]) - return 0; - } - return 1; -} - -/** - * iucv_pm_freeze() - Freeze PM callback - * @dev: iucv-based device - * - * disable iucv interrupts - * invoke callback function of the iucv-based driver - * shut down iucv, if no iucv-pathes are established anymore - */ -static int iucv_pm_freeze(struct device *dev) -{ - int cpu; - struct iucv_irq_list *p, *n; - int rc = 0; - -#ifdef CONFIG_PM_DEBUG - printk(KERN_WARNING "iucv_pm_freeze\n"); -#endif - if (iucv_pm_state != IUCV_PM_FREEZING) { - for_each_cpu(cpu, &iucv_irq_cpumask) - smp_call_function_single(cpu, iucv_block_cpu_almost, - NULL, 1); - cancel_work_sync(&iucv_work); - list_for_each_entry_safe(p, n, &iucv_work_queue, list) { - list_del_init(&p->list); - iucv_sever_pathid(p->data.ippathid, - iucv_error_no_listener); - kfree(p); - } - } - iucv_pm_state = IUCV_PM_FREEZING; - if (dev->driver && dev->driver->pm && dev->driver->pm->freeze) - rc = dev->driver->pm->freeze(dev); - if (iucv_path_table_empty()) - iucv_disable(); - return rc; -} - -/** - * iucv_pm_thaw() - Thaw PM callback - * @dev: iucv-based device - * - * make iucv ready for use again: allocate path table, declare interrupt buffers - * and enable iucv interrupts - * invoke callback function of the iucv-based driver - */ -static int iucv_pm_thaw(struct device *dev) -{ - int rc = 0; - -#ifdef CONFIG_PM_DEBUG - printk(KERN_WARNING "iucv_pm_thaw\n"); -#endif - iucv_pm_state = IUCV_PM_THAWING; - if (!iucv_path_table) { - rc = iucv_enable(); - if (rc) - goto out; - } - if (cpumask_empty(&iucv_irq_cpumask)) { - if (iucv_nonsmp_handler) - /* enable interrupts on one cpu */ - iucv_allow_cpu(NULL); - else - /* enable interrupts on all cpus */ - iucv_setmask_mp(); - } - if (dev->driver && dev->driver->pm && dev->driver->pm->thaw) - rc = dev->driver->pm->thaw(dev); -out: - return rc; -} - -/** - * iucv_pm_restore() - Restore PM callback - * @dev: iucv-based device - * - * make iucv ready for use again: allocate path table, declare interrupt buffers - * and enable iucv interrupts - * invoke callback function of the iucv-based driver - */ -static int iucv_pm_restore(struct device *dev) -{ - int rc = 0; - -#ifdef CONFIG_PM_DEBUG - printk(KERN_WARNING "iucv_pm_restore %p\n", iucv_path_table); -#endif - if ((iucv_pm_state != IUCV_PM_RESTORING) && iucv_path_table) - pr_warn("Suspending Linux did not completely close all IUCV connections\n"); - iucv_pm_state = IUCV_PM_RESTORING; - if (cpumask_empty(&iucv_irq_cpumask)) { - rc = iucv_query_maxconn(); - rc = iucv_enable(); - if (rc) - goto out; - } - if (dev->driver 
&& dev->driver->pm && dev->driver->pm->restore) - rc = dev->driver->pm->restore(dev); -out: - return rc; -} - struct iucv_interface iucv_if = { .message_receive = iucv_message_receive, .__message_receive = __iucv_message_receive, -- cgit v1.2.3 From 585bc2209539501674b77de68f58e98039892501 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 19 May 2020 21:10:09 +0200 Subject: net/af_iucv: remove pm support commit 394216275c7d ("s390: remove broken hibernate / power management support") removed support for ARCH_HIBERNATION_POSSIBLE from s390. So drop the unused pm ops from the s390-only af_iucv socket code. Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- net/iucv/af_iucv.c | 141 +---------------------------------------------------- 1 file changed, 1 insertion(+), 140 deletions(-) (limited to 'net') diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index c4bdcbc84b07..84bd18fea1d6 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -87,7 +87,6 @@ do { \ static void iucv_sock_kill(struct sock *sk); static void iucv_sock_close(struct sock *sk); -static void iucv_sever_path(struct sock *, int); static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); @@ -127,110 +126,6 @@ static inline void low_nmcpy(unsigned char *dst, char *src) memcpy(&dst[8], src, 8); } -static int afiucv_pm_prepare(struct device *dev) -{ -#ifdef CONFIG_PM_DEBUG - printk(KERN_WARNING "afiucv_pm_prepare\n"); -#endif - return 0; -} - -static void afiucv_pm_complete(struct device *dev) -{ -#ifdef CONFIG_PM_DEBUG - printk(KERN_WARNING "afiucv_pm_complete\n"); -#endif -} - -/** - * afiucv_pm_freeze() - Freeze PM callback - * @dev: AFIUCV dummy device - * - * Sever all established IUCV communication pathes - */ -static int afiucv_pm_freeze(struct device *dev) -{ - struct iucv_sock *iucv; - struct sock *sk; - -#ifdef CONFIG_PM_DEBUG - printk(KERN_WARNING "afiucv_pm_freeze\n"); -#endif - read_lock(&iucv_sk_list.lock); - sk_for_each(sk, &iucv_sk_list.head) { - iucv = iucv_sk(sk); - switch (sk->sk_state) { - case IUCV_DISCONN: - case IUCV_CLOSING: - case IUCV_CONNECTED: - iucv_sever_path(sk, 0); - break; - case IUCV_OPEN: - case IUCV_BOUND: - case IUCV_LISTEN: - case IUCV_CLOSED: - default: - break; - } - skb_queue_purge(&iucv->send_skb_q); - skb_queue_purge(&iucv->backlog_skb_q); - } - read_unlock(&iucv_sk_list.lock); - return 0; -} - -/** - * afiucv_pm_restore_thaw() - Thaw and restore PM callback - * @dev: AFIUCV dummy device - * - * socket clean up after freeze - */ -static int afiucv_pm_restore_thaw(struct device *dev) -{ - struct sock *sk; - -#ifdef CONFIG_PM_DEBUG - printk(KERN_WARNING "afiucv_pm_restore_thaw\n"); -#endif - read_lock(&iucv_sk_list.lock); - sk_for_each(sk, &iucv_sk_list.head) { - switch (sk->sk_state) { - case IUCV_CONNECTED: - sk->sk_err = EPIPE; - sk->sk_state = IUCV_DISCONN; - sk->sk_state_change(sk); - break; - case IUCV_DISCONN: - case IUCV_CLOSING: - case IUCV_LISTEN: - case IUCV_BOUND: - case IUCV_OPEN: - default: - break; - } - } - read_unlock(&iucv_sk_list.lock); - return 0; -} - -static const struct dev_pm_ops afiucv_pm_ops = { - .prepare = afiucv_pm_prepare, - .complete = afiucv_pm_complete, - .freeze = afiucv_pm_freeze, - .thaw = afiucv_pm_restore_thaw, - .restore = afiucv_pm_restore_thaw, -}; - -static struct device_driver af_iucv_driver = { - .owner = THIS_MODULE, - .name = "afiucv", - .bus = NULL, - .pm = &afiucv_pm_ops, -}; - -/* dummy device used as trigger for PM functions */ -static struct 
device *af_iucv_dev; - /** * iucv_msg_length() - Returns the length of an iucv message. * @msg: Pointer to struct iucv_message, MUST NOT be NULL @@ -2409,45 +2304,11 @@ static struct packet_type iucv_packet_type = { static int afiucv_iucv_init(void) { - int err; - - err = pr_iucv->iucv_register(&af_iucv_handler, 0); - if (err) - goto out; - /* establish dummy device */ - af_iucv_driver.bus = pr_iucv->bus; - err = driver_register(&af_iucv_driver); - if (err) - goto out_iucv; - af_iucv_dev = kzalloc(sizeof(struct device), GFP_KERNEL); - if (!af_iucv_dev) { - err = -ENOMEM; - goto out_driver; - } - dev_set_name(af_iucv_dev, "af_iucv"); - af_iucv_dev->bus = pr_iucv->bus; - af_iucv_dev->parent = pr_iucv->root; - af_iucv_dev->release = (void (*)(struct device *))kfree; - af_iucv_dev->driver = &af_iucv_driver; - err = device_register(af_iucv_dev); - if (err) - goto out_iucv_dev; - return 0; - -out_iucv_dev: - put_device(af_iucv_dev); -out_driver: - driver_unregister(&af_iucv_driver); -out_iucv: - pr_iucv->iucv_unregister(&af_iucv_handler, 0); -out: - return err; + return pr_iucv->iucv_register(&af_iucv_handler, 0); } static void afiucv_iucv_exit(void) { - device_unregister(af_iucv_dev); - driver_unregister(&af_iucv_driver); pr_iucv->iucv_unregister(&af_iucv_handler, 0); } -- cgit v1.2.3 From 0d1c7664ed2044048d5ed790b943affa296bc3a4 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 19 May 2020 21:10:10 +0200 Subject: net/af_iucv: replace open-coded U16_MAX Improve the readability of a range check. Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- net/iucv/af_iucv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 84bd18fea1d6..b02470a04c50 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -1559,7 +1560,7 @@ static int iucv_sock_setsockopt(struct socket *sock, int level, int optname, switch (sk->sk_state) { case IUCV_OPEN: case IUCV_BOUND: - if (val < 1 || val > (u16)(~0)) + if (val < 1 || val > U16_MAX) rc = -EINVAL; else iucv->msglimit = val; -- cgit v1.2.3 From dca1262f97499337489da7f5aa0bc6fec247a83f Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 19 May 2020 21:10:11 +0200 Subject: net/af_iucv: remove a redundant zero initialization txmsg is declared as {0}, no need to clear individual fields later on. Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- net/iucv/af_iucv.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index b02470a04c50..799dcf5483de 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -996,7 +996,6 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, /* initialize defaults */ cmsg_done = 0; /* check for duplicate headers */ - txmsg.class = 0; /* iterate over control messages */ for_each_cmsghdr(cmsg, msg) { -- cgit v1.2.3 From e9a36ca5f6f302675e7e36101ffa0ca7f9b8779b Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 19 May 2020 21:10:12 +0200 Subject: net/af_iucv: clean up function prototypes Remove a bunch of forward declarations (trivially shifting code around where needed), and make a few functions static. Signed-off-by: Julian Wiedmann Signed-off-by: David S. 
Miller --- include/net/iucv/af_iucv.h | 8 ---- net/iucv/af_iucv.c | 108 +++++++++++++++++++++------------------------ 2 files changed, 51 insertions(+), 65 deletions(-) (limited to 'net') diff --git a/include/net/iucv/af_iucv.h b/include/net/iucv/af_iucv.h index 14a490246be9..9259ce2b22f3 100644 --- a/include/net/iucv/af_iucv.h +++ b/include/net/iucv/af_iucv.h @@ -158,12 +158,4 @@ struct iucv_sock_list { atomic_t autobind_name; }; -__poll_t iucv_sock_poll(struct file *file, struct socket *sock, - poll_table *wait); -void iucv_sock_link(struct iucv_sock_list *l, struct sock *s); -void iucv_sock_unlink(struct iucv_sock_list *l, struct sock *s); -void iucv_accept_enqueue(struct sock *parent, struct sock *sk); -void iucv_accept_unlink(struct sock *sk); -struct sock *iucv_accept_dequeue(struct sock *parent, struct socket *newsock); - #endif /* __IUCV_H */ diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 799dcf5483de..ee0add15497d 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -37,8 +37,6 @@ static char iucv_userid[80]; -static const struct proto_ops iucv_sock_ops; - static struct proto iucv_proto = { .name = "AF_IUCV", .owner = THIS_MODULE, @@ -86,13 +84,11 @@ do { \ __ret; \ }) +static struct sock *iucv_accept_dequeue(struct sock *parent, + struct socket *newsock); static void iucv_sock_kill(struct sock *sk); static void iucv_sock_close(struct sock *sk); -static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev); -static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock, - struct sk_buff *skb, u8 flags); static void afiucv_hs_callback_txnotify(struct sk_buff *, enum iucv_tx_notify); /* Call Back functions */ @@ -331,6 +327,20 @@ static void iucv_sock_cleanup_listen(struct sock *parent) parent->sk_state = IUCV_CLOSED; } +static void iucv_sock_link(struct iucv_sock_list *l, struct sock *sk) +{ + write_lock_bh(&l->lock); + sk_add_node(sk, &l->head); + write_unlock_bh(&l->lock); +} + +static void iucv_sock_unlink(struct iucv_sock_list *l, struct sock *sk) +{ + write_lock_bh(&l->lock); + sk_del_node_init(sk); + write_unlock_bh(&l->lock); +} + /* Kill socket (only if zapped and orphaned) */ static void iucv_sock_kill(struct sock *sk) { @@ -503,53 +513,7 @@ static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio, return sk; } -/* Create an IUCV socket */ -static int iucv_sock_create(struct net *net, struct socket *sock, int protocol, - int kern) -{ - struct sock *sk; - - if (protocol && protocol != PF_IUCV) - return -EPROTONOSUPPORT; - - sock->state = SS_UNCONNECTED; - - switch (sock->type) { - case SOCK_STREAM: - sock->ops = &iucv_sock_ops; - break; - case SOCK_SEQPACKET: - /* currently, proto ops can handle both sk types */ - sock->ops = &iucv_sock_ops; - break; - default: - return -ESOCKTNOSUPPORT; - } - - sk = iucv_sock_alloc(sock, protocol, GFP_KERNEL, kern); - if (!sk) - return -ENOMEM; - - iucv_sock_init(sk, NULL); - - return 0; -} - -void iucv_sock_link(struct iucv_sock_list *l, struct sock *sk) -{ - write_lock_bh(&l->lock); - sk_add_node(sk, &l->head); - write_unlock_bh(&l->lock); -} - -void iucv_sock_unlink(struct iucv_sock_list *l, struct sock *sk) -{ - write_lock_bh(&l->lock); - sk_del_node_init(sk); - write_unlock_bh(&l->lock); -} - -void iucv_accept_enqueue(struct sock *parent, struct sock *sk) +static void iucv_accept_enqueue(struct sock *parent, struct sock *sk) { unsigned long flags; struct iucv_sock *par = iucv_sk(parent); @@ -562,7 +526,7 @@ void 
iucv_accept_enqueue(struct sock *parent, struct sock *sk) sk_acceptq_added(parent); } -void iucv_accept_unlink(struct sock *sk) +static void iucv_accept_unlink(struct sock *sk) { unsigned long flags; struct iucv_sock *par = iucv_sk(iucv_sk(sk)->parent); @@ -575,7 +539,8 @@ void iucv_accept_unlink(struct sock *sk) sock_put(sk); } -struct sock *iucv_accept_dequeue(struct sock *parent, struct socket *newsock) +static struct sock *iucv_accept_dequeue(struct sock *parent, + struct socket *newsock) { struct iucv_sock *isk, *n; struct sock *sk; @@ -1406,8 +1371,8 @@ static inline __poll_t iucv_accept_poll(struct sock *parent) return 0; } -__poll_t iucv_sock_poll(struct file *file, struct socket *sock, - poll_table *wait) +static __poll_t iucv_sock_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask = 0; @@ -2291,6 +2256,35 @@ static const struct proto_ops iucv_sock_ops = { .getsockopt = iucv_sock_getsockopt, }; +static int iucv_sock_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + + if (protocol && protocol != PF_IUCV) + return -EPROTONOSUPPORT; + + sock->state = SS_UNCONNECTED; + + switch (sock->type) { + case SOCK_STREAM: + case SOCK_SEQPACKET: + /* currently, proto ops can handle both sk types */ + sock->ops = &iucv_sock_ops; + break; + default: + return -ESOCKTNOSUPPORT; + } + + sk = iucv_sock_alloc(sock, protocol, GFP_KERNEL, kern); + if (!sk) + return -ENOMEM; + + iucv_sock_init(sk, NULL); + + return 0; +} + static const struct net_proto_family iucv_sock_family_ops = { .family = AF_IUCV, .owner = THIS_MODULE, -- cgit v1.2.3 From c384b8a70c59d85f83a05c8963e71d35da2607b9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 May 2020 15:03:11 +0200 Subject: ipv4: streamline ipmr_new_tunnel Reduce a few levels of indentation to simplify the function. Signed-off-by: Christoph Hellwig Signed-off-by: David S.
Miller --- net/ipv4/ipmr.c | 73 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 36 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 5c218db2dede..a1169b694113 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -471,50 +471,49 @@ static bool ipmr_init_vif_indev(const struct net_device *dev) static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) { - struct net_device *dev; + struct net_device *tunnel_dev, *new_dev; + struct ip_tunnel_parm p = { }; + mm_segment_t oldfs = get_fs(); + struct ifreq ifr; + int err; - dev = __dev_get_by_name(net, "tunl0"); + tunnel_dev = __dev_get_by_name(net, "tunl0"); + if (!tunnel_dev) + goto out; - if (dev) { - const struct net_device_ops *ops = dev->netdev_ops; - int err; - struct ifreq ifr; - struct ip_tunnel_parm p; + p.iph.daddr = v->vifc_rmt_addr.s_addr; + p.iph.saddr = v->vifc_lcl_addr.s_addr; + p.iph.version = 4; + p.iph.ihl = 5; + p.iph.protocol = IPPROTO_IPIP; + sprintf(p.name, "dvmrp%d", v->vifc_vifi); + ifr.ifr_ifru.ifru_data = (__force void __user *)&p; - memset(&p, 0, sizeof(p)); - p.iph.daddr = v->vifc_rmt_addr.s_addr; - p.iph.saddr = v->vifc_lcl_addr.s_addr; - p.iph.version = 4; - p.iph.ihl = 5; - p.iph.protocol = IPPROTO_IPIP; - sprintf(p.name, "dvmrp%d", v->vifc_vifi); - ifr.ifr_ifru.ifru_data = (__force void __user *)&p; + if (!tunnel_dev->netdev_ops->ndo_do_ioctl) + goto out; - if (ops->ndo_do_ioctl) { - mm_segment_t oldfs = get_fs(); + set_fs(KERNEL_DS); + err = tunnel_dev->netdev_ops->ndo_do_ioctl(tunnel_dev, &ifr, + SIOCADDTUNNEL); + set_fs(oldfs); + if (err) + goto out; - set_fs(KERNEL_DS); - err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); - set_fs(oldfs); - } else { - err = -EOPNOTSUPP; - } - dev = NULL; + new_dev = __dev_get_by_name(net, p.name); + if (!new_dev) + goto out; - if (err == 0 && - (dev = __dev_get_by_name(net, p.name)) != NULL) { - dev->flags |= IFF_MULTICAST; - if (!ipmr_init_vif_indev(dev)) - goto failure; - if (dev_open(dev, NULL)) - goto failure; - dev_hold(dev); - } - } - return dev; + new_dev->flags |= IFF_MULTICAST; + if (!ipmr_init_vif_indev(new_dev)) + goto out_unregister; + if (dev_open(new_dev, NULL)) + goto out_unregister; + dev_hold(new_dev); + return new_dev; -failure: - unregister_netdevice(dev); +out_unregister: + unregister_netdevice(new_dev); +out: return NULL; } -- cgit v1.2.3 From c1fd1182c43692022a7938d6b496fa21fcd49717 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 May 2020 15:03:12 +0200 Subject: ipv4: consolidate the VIFF_TUNNEL handling in ipmr_new_tunnel Also move the dev_set_allmulti call and the error handling into the ioctl helper. This allows reusing the already looked-up tunnel_dev pointer and the set-up argument structure for the deletion in the error handler. Signed-off-by: Christoph Hellwig Signed-off-by: David S.
Miller --- net/ipv4/ipmr.c | 53 +++++++++++++---------------------------------- 1 file changed, 13 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a1169b694113..cd1a3260a99a 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -421,37 +421,6 @@ static void ipmr_free_table(struct mr_table *mrt) /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ -static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) -{ - struct net *net = dev_net(dev); - - dev_close(dev); - - dev = __dev_get_by_name(net, "tunl0"); - if (dev) { - const struct net_device_ops *ops = dev->netdev_ops; - struct ifreq ifr; - struct ip_tunnel_parm p; - - memset(&p, 0, sizeof(p)); - p.iph.daddr = v->vifc_rmt_addr.s_addr; - p.iph.saddr = v->vifc_lcl_addr.s_addr; - p.iph.version = 4; - p.iph.ihl = 5; - p.iph.protocol = IPPROTO_IPIP; - sprintf(p.name, "dvmrp%d", v->vifc_vifi); - ifr.ifr_ifru.ifru_data = (__force void __user *)&p; - - if (ops->ndo_do_ioctl) { - mm_segment_t oldfs = get_fs(); - - set_fs(KERNEL_DS); - ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL); - set_fs(oldfs); - } - } -} - /* Initialize ipmr pimreg/tunnel in_device */ static bool ipmr_init_vif_indev(const struct net_device *dev) { @@ -509,12 +478,22 @@ static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) if (dev_open(new_dev, NULL)) goto out_unregister; dev_hold(new_dev); + err = dev_set_allmulti(new_dev, 1); + if (err) { + dev_close(new_dev); + set_fs(KERNEL_DS); + tunnel_dev->netdev_ops->ndo_do_ioctl(tunnel_dev, &ifr, + SIOCDELTUNNEL); + set_fs(oldfs); + dev_put(new_dev); + new_dev = ERR_PTR(err); + } return new_dev; out_unregister: unregister_netdevice(new_dev); out: - return NULL; + return ERR_PTR(-ENOBUFS); } #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) @@ -866,14 +845,8 @@ static int vif_add(struct net *net, struct mr_table *mrt, break; case VIFF_TUNNEL: dev = ipmr_new_tunnel(net, vifc); - if (!dev) - return -ENOBUFS; - err = dev_set_allmulti(dev, 1); - if (err) { - ipmr_del_tunnel(dev, vifc); - dev_put(dev); - return err; - } + if (IS_ERR(dev)) + return PTR_ERR(dev); break; case VIFF_USE_IFINDEX: case 0: -- cgit v1.2.3 From 607259a695312cdfac2b52fb9d5b5890c834d573 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 May 2020 15:03:13 +0200 Subject: net: add a new ndo_tunnel_ioctl method This method is used to properly allow kernel callers of the IPv4 route management ioctls. The existing ip_tunnel_ioctl helper is renamed to ip_tunnel_ctl to better reflect that it doesn't directly implement ioctls touching user memory, and is used for the guts of ndo_tunnel_ctl implementations. A new ip_tunnel_ioctl helper is added that can be wired up directly to the ndo_do_ioctl method and takes care of the copy to and from userspace. Signed-off-by: Christoph Hellwig Signed-off-by: David S.
Miller --- include/linux/netdevice.h | 6 ++++++ include/net/ip_tunnels.h | 3 ++- net/ipv4/ip_gre.c | 35 ++++++++++++++--------------------- net/ipv4/ip_tunnel.c | 16 +++++++++++++++- net/ipv4/ip_vti.c | 32 +++++++++++++------------------- net/ipv4/ipip.c | 30 +++++++++--------------------- 6 files changed, 59 insertions(+), 63 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6a8f8daef09d..a18f8fdf4260 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -53,6 +53,7 @@ struct netpoll_info; struct device; struct phy_device; struct dsa_port; +struct ip_tunnel_parm; struct macsec_context; struct macsec_ops; @@ -1274,6 +1275,9 @@ struct netdev_net_notifier { * Get devlink port instance associated with a given netdev. * Called with a reference on the netdevice and devlink locks only, * rtnl_lock is not held. + * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, + * int cmd); + * Add, change, delete or get information on an IPv4 tunnel. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1479,6 +1483,8 @@ struct net_device_ops { int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags); struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev); + int (*ndo_tunnel_ctl)(struct net_device *dev, + struct ip_tunnel_parm *p, int cmd); }; /** diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 236503a50759..076e5d7db7d3 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -269,7 +269,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const u8 proto, int tunnel_hlen); -int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); +int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); +int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict); int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0ce9b91ff55c..4e31f23e4117 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -768,45 +768,37 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu) } } -static int ipgre_tunnel_ioctl(struct net_device *dev, - struct ifreq *ifr, int cmd) +static int ipgre_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, + int cmd) { - struct ip_tunnel_parm p; int err; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - return -EFAULT; - if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { - if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE || - p.iph.ihl != 5 || (p.iph.frag_off & htons(~IP_DF)) || - ((p.i_flags | p.o_flags) & (GRE_VERSION | GRE_ROUTING))) + if (p->iph.version != 4 || p->iph.protocol != IPPROTO_GRE || + p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF)) || + ((p->i_flags | p->o_flags) & (GRE_VERSION | GRE_ROUTING))) return -EINVAL; } - p.i_flags = gre_flags_to_tnl_flags(p.i_flags); - p.o_flags = gre_flags_to_tnl_flags(p.o_flags); + p->i_flags = gre_flags_to_tnl_flags(p->i_flags); + p->o_flags = gre_flags_to_tnl_flags(p->o_flags); - err = ip_tunnel_ioctl(dev, &p, cmd); + err = ip_tunnel_ctl(dev, p, cmd); if (err) return err; if (cmd == SIOCCHGTUNNEL) { struct ip_tunnel *t = netdev_priv(dev); - t->parms.i_flags = p.i_flags; - t->parms.o_flags = 
p.o_flags; + t->parms.i_flags = p->i_flags; + t->parms.o_flags = p->o_flags; if (strcmp(dev->rtnl_link_ops->kind, "erspan")) ipgre_link_update(dev, true); } - p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags); - p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags); - - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) - return -EFAULT; - + p->i_flags = gre_tnl_flags_to_gre_flags(p->i_flags); + p->o_flags = gre_tnl_flags_to_gre_flags(p->o_flags); return 0; } @@ -924,10 +916,11 @@ static const struct net_device_ops ipgre_netdev_ops = { .ndo_stop = ipgre_close, #endif .ndo_start_xmit = ipgre_xmit, - .ndo_do_ioctl = ipgre_tunnel_ioctl, + .ndo_do_ioctl = ip_tunnel_ioctl, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_tunnel_ctl = ipgre_tunnel_ctl, }; #define GRE_FEATURES (NETIF_F_SG | \ diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index cd4b84310d92..f4f1d11eab50 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -860,7 +860,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, netdev_state_change(dev); } -int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) { int err = 0; struct ip_tunnel *t = netdev_priv(dev); @@ -960,6 +960,20 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) done: return err; } +EXPORT_SYMBOL_GPL(ip_tunnel_ctl); + +int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + struct ip_tunnel_parm p; + int err; + + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + return -EFAULT; + err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd); + if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + return -EFAULT; + return err; +} EXPORT_SYMBOL_GPL(ip_tunnel_ioctl); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 1b4e6f298648..c8974360a99f 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -378,38 +378,31 @@ static int vti4_err(struct sk_buff *skb, u32 info) } static int -vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) { int err = 0; - struct ip_tunnel_parm p; - - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - return -EFAULT; if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { - if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || - p.iph.ihl != 5) + if (p->iph.version != 4 || p->iph.protocol != IPPROTO_IPIP || + p->iph.ihl != 5) return -EINVAL; } - if (!(p.i_flags & GRE_KEY)) - p.i_key = 0; - if (!(p.o_flags & GRE_KEY)) - p.o_key = 0; + if (!(p->i_flags & GRE_KEY)) + p->i_key = 0; + if (!(p->o_flags & GRE_KEY)) + p->o_key = 0; - p.i_flags = VTI_ISVTI; + p->i_flags = VTI_ISVTI; - err = ip_tunnel_ioctl(dev, &p, cmd); + err = ip_tunnel_ctl(dev, p, cmd); if (err) return err; if (cmd != SIOCDELTUNNEL) { - p.i_flags |= GRE_KEY; - p.o_flags |= GRE_KEY; + p->i_flags |= GRE_KEY; + p->o_flags |= GRE_KEY; } - - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) - return -EFAULT; return 0; } @@ -417,10 +410,11 @@ static const struct net_device_ops vti_netdev_ops = { .ndo_init = vti_tunnel_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = vti_tunnel_xmit, - .ndo_do_ioctl = vti_tunnel_ioctl, + .ndo_do_ioctl = ip_tunnel_ioctl, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = 
ip_tunnel_get_stats64, .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_tunnel_ctl = vti_tunnel_ctl, }; static void vti_tunnel_setup(struct net_device *dev) diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 2f01cf6fa0de..df663baf2516 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -327,41 +327,29 @@ static bool ipip_tunnel_ioctl_verify_protocol(u8 ipproto) } static int -ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) { - int err = 0; - struct ip_tunnel_parm p; - - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - return -EFAULT; - if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { - if (p.iph.version != 4 || - !ipip_tunnel_ioctl_verify_protocol(p.iph.protocol) || - p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) + if (p->iph.version != 4 || + !ipip_tunnel_ioctl_verify_protocol(p->iph.protocol) || + p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF))) return -EINVAL; } - p.i_key = p.o_key = 0; - p.i_flags = p.o_flags = 0; - err = ip_tunnel_ioctl(dev, &p, cmd); - if (err) - return err; - - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) - return -EFAULT; - - return 0; + p->i_key = p->o_key = 0; + p->i_flags = p->o_flags = 0; + return ip_tunnel_ctl(dev, p, cmd); } static const struct net_device_ops ipip_netdev_ops = { .ndo_init = ipip_tunnel_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = ipip_tunnel_xmit, - .ndo_do_ioctl = ipip_tunnel_ioctl, + .ndo_do_ioctl = ip_tunnel_ioctl, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_tunnel_ctl = ipip_tunnel_ctl, }; #define IPIP_FEATURES (NETIF_F_SG | \ -- cgit v1.2.3 From c7e3670516042bfd8147151d9008b5874a6eb73e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 May 2020 15:03:14 +0200 Subject: ipmr: use ->ndo_tunnel_ctl in ipmr_new_tunnel Use the new ->ndo_tunnel_ctl instead of overriding the address limit and using ->ndo_do_ioctl just to do a pointless user copy. Signed-off-by: Christoph Hellwig Signed-off-by: David S.
Miller --- net/ipv4/ipmr.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index cd1a3260a99a..d3e9b80a57de 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -442,8 +442,6 @@ static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) { struct net_device *tunnel_dev, *new_dev; struct ip_tunnel_parm p = { }; - mm_segment_t oldfs = get_fs(); - struct ifreq ifr; int err; tunnel_dev = __dev_get_by_name(net, "tunl0"); @@ -456,15 +454,11 @@ static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) p.iph.ihl = 5; p.iph.protocol = IPPROTO_IPIP; sprintf(p.name, "dvmrp%d", v->vifc_vifi); - ifr.ifr_ifru.ifru_data = (__force void __user *)&p; - if (!tunnel_dev->netdev_ops->ndo_do_ioctl) + if (!tunnel_dev->netdev_ops->ndo_tunnel_ctl) goto out; - - set_fs(KERNEL_DS); - err = tunnel_dev->netdev_ops->ndo_do_ioctl(tunnel_dev, &ifr, + err = tunnel_dev->netdev_ops->ndo_tunnel_ctl(tunnel_dev, &p, SIOCADDTUNNEL); - set_fs(oldfs); if (err) goto out; @@ -481,10 +475,8 @@ static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) err = dev_set_allmulti(new_dev, 1); if (err) { dev_close(new_dev); - set_fs(KERNEL_DS); - tunnel_dev->netdev_ops->ndo_do_ioctl(tunnel_dev, &ifr, + tunnel_dev->netdev_ops->ndo_tunnel_ctl(tunnel_dev, &p, SIOCDELTUNNEL); - set_fs(oldfs); dev_put(new_dev); new_dev = ERR_PTR(err); } -- cgit v1.2.3 From fd5d687b76b32543f6b2024ebe21f988ed1f8859 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 May 2020 15:03:15 +0200 Subject: sit: refactor ipip6_tunnel_ioctl Split the ioctl handler into one function per command instead of having all the logic sit in one giant switch statement. Signed-off-by: Christoph Hellwig Signed-off-by: David S.
Miller --- net/ipv6/sit.c | 368 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 210 insertions(+), 158 deletions(-) (limited to 'net') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 98954830c40b..7c158fdc02da 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -83,6 +83,13 @@ struct sit_net { struct net_device *fb_tunnel_dev; }; +static inline struct sit_net *dev_to_sit_net(struct net_device *dev) +{ + struct ip_tunnel *t = netdev_priv(dev); + + return net_generic(t->net, sit_net_id); +} + /* * Must be invoked with rcu_read_lock */ @@ -291,14 +298,18 @@ __ipip6_tunnel_locate_prl(struct ip_tunnel *t, __be32 addr) } -static int ipip6_tunnel_get_prl(struct ip_tunnel *t, - struct ip_tunnel_prl __user *a) +static int ipip6_tunnel_get_prl(struct net_device *dev, struct ifreq *ifr) { + struct ip_tunnel_prl __user *a = ifr->ifr_ifru.ifru_data; + struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_prl kprl, *kp; struct ip_tunnel_prl_entry *prl; unsigned int cmax, c = 0, ca, len; int ret = 0; + if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) + return -EINVAL; + if (copy_from_user(&kprl, a, sizeof(kprl))) return -EFAULT; cmax = kprl.datalen / sizeof(kprl); @@ -441,6 +452,35 @@ out: return err; } +static int ipip6_tunnel_prl_ctl(struct net_device *dev, struct ifreq *ifr, + int cmd) +{ + struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_prl prl; + int err; + + if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) + return -EINVAL; + + if (copy_from_user(&prl, ifr->ifr_ifru.ifru_data, sizeof(prl))) + return -EFAULT; + + switch (cmd) { + case SIOCDELPRL: + err = ipip6_tunnel_del_prl(t, &prl); + break; + case SIOCADDPRL: + case SIOCCHGPRL: + err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL); + break; + } + dst_cache_reset(&t->dst_cache); + netdev_state_change(dev); + return err; +} + static int isatap_chksrc(struct sk_buff *skb, const struct iphdr *iph, struct ip_tunnel *t) { @@ -1151,7 +1191,53 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t, netdev_state_change(t->dev); return 0; } -#endif + +static int +ipip6_tunnel_get6rd(struct net_device *dev, struct ifreq *ifr) +{ + struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_6rd ip6rd; + struct ip_tunnel_parm p; + + if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + return -EFAULT; + t = ipip6_tunnel_locate(t->net, &p, 0); + } + if (!t) + t = netdev_priv(dev); + + ip6rd.prefix = t->ip6rd.prefix; + ip6rd.relay_prefix = t->ip6rd.relay_prefix; + ip6rd.prefixlen = t->ip6rd.prefixlen; + ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen; + if (copy_to_user(ifr->ifr_ifru.ifru_data, &ip6rd, sizeof(ip6rd))) + return -EFAULT; + return 0; +} + +static int +ipip6_tunnel_6rdctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_6rd ip6rd; + int err; + + if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + if (copy_from_user(&ip6rd, ifr->ifr_ifru.ifru_data, sizeof(ip6rd))) + return -EFAULT; + + if (cmd != SIOCDEL6RD) { + err = ipip6_tunnel_update_6rd(t, &ip6rd); + if (err < 0) + return err; + } else + ipip6_tunnel_clone_6rd(dev, dev_to_sit_net(dev)); + return 0; +} + +#endif /* CONFIG_IPV6_SIT_6RD */ static bool ipip6_valid_ip_proto(u8 ipproto) { @@ -1164,185 +1250,151 @@ static bool ipip6_valid_ip_proto(u8 ipproto) } static int -ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) 
+__ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm *p) { - int err = 0; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + if (!ipip6_valid_ip_proto(p->iph.protocol)) + return -EINVAL; + if (p->iph.version != 4 || + p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF))) + return -EINVAL; + + if (p->iph.ttl) + p->iph.frag_off |= htons(IP_DF); + return 0; +} + +static int +ipip6_tunnel_get(struct net_device *dev, struct ifreq *ifr) +{ + struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm p; - struct ip_tunnel_prl prl; + + if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + return -EFAULT; + t = ipip6_tunnel_locate(t->net, &p, 0); + } + if (!t) + t = netdev_priv(dev); + + if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) + return -EFAULT; + return 0; +} + +static int +ipip6_tunnel_add(struct net_device *dev, struct ifreq *ifr) +{ struct ip_tunnel *t = netdev_priv(dev); - struct net *net = t->net; - struct sit_net *sitn = net_generic(net, sit_net_id); -#ifdef CONFIG_IPV6_SIT_6RD - struct ip_tunnel_6rd ip6rd; -#endif + struct ip_tunnel_parm p; + int err; - switch (cmd) { - case SIOCGETTUNNEL: -#ifdef CONFIG_IPV6_SIT_6RD - case SIOCGET6RD: -#endif - if (dev == sitn->fb_tunnel_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { - err = -EFAULT; - break; - } - t = ipip6_tunnel_locate(net, &p, 0); - if (!t) - t = netdev_priv(dev); - } + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + return -EFAULT; + err = __ipip6_tunnel_ioctl_validate(t->net, &p); + if (err) + return err; - err = -EFAULT; - if (cmd == SIOCGETTUNNEL) { - memcpy(&p, &t->parms, sizeof(p)); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, - sizeof(p))) - goto done; -#ifdef CONFIG_IPV6_SIT_6RD + t = ipip6_tunnel_locate(t->net, &p, 1); + if (!t) + return -ENOBUFS; + + if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) + return -EFAULT; + return 0; +} + +static int +ipip6_tunnel_change(struct net_device *dev, struct ifreq *ifr) +{ + struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm p; + int err; + + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + return -EFAULT; + err = __ipip6_tunnel_ioctl_validate(t->net, &p); + if (err) + return err; + + t = ipip6_tunnel_locate(t->net, &p, 0); + if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { + if (!t) + return -ENOENT; + } else { + if (t) { + if (t->dev != dev) + return -EEXIST; } else { - ip6rd.prefix = t->ip6rd.prefix; - ip6rd.relay_prefix = t->ip6rd.relay_prefix; - ip6rd.prefixlen = t->ip6rd.prefixlen; - ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen; - if (copy_to_user(ifr->ifr_ifru.ifru_data, &ip6rd, - sizeof(ip6rd))) - goto done; -#endif + if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) || + (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) + return -EINVAL; + t = netdev_priv(dev); } - err = 0; - break; - case SIOCADDTUNNEL: - case SIOCCHGTUNNEL: - err = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - goto done; + ipip6_tunnel_update(t, &p, t->fwmark); + } - err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - goto done; - - err = -EINVAL; - if (!ipip6_valid_ip_proto(p.iph.protocol)) - goto done; - if (p.iph.version != 4 || - p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) - goto done; - if (p.iph.ttl) - p.iph.frag_off |= htons(IP_DF); - - t = ipip6_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); - - if (dev != sitn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { 
- if (t) { - if (t->dev != dev) { - err = -EEXIST; - break; - } - } else { - if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) || - (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) { - err = -EINVAL; - break; - } - t = netdev_priv(dev); - } + if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) + return -EFAULT; + return 0; +} - ipip6_tunnel_update(t, &p, t->fwmark); - } +static int +ipip6_tunnel_del(struct net_device *dev, struct ifreq *ifr) +{ + struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm p; - if (t) { - err = 0; - if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) - err = -EFAULT; - } else - err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); - break; + if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN)) + return -EPERM; - case SIOCDELTUNNEL: - err = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - goto done; - - if (dev == sitn->fb_tunnel_dev) { - err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - goto done; - err = -ENOENT; - t = ipip6_tunnel_locate(net, &p, 0); - if (!t) - goto done; - err = -EPERM; - if (t == netdev_priv(sitn->fb_tunnel_dev)) - goto done; - dev = t->dev; - } - unregister_netdevice(dev); - err = 0; - break; + if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + return -EFAULT; + t = ipip6_tunnel_locate(t->net, &p, 0); + if (!t) + return -ENOENT; + if (t == netdev_priv(dev_to_sit_net(dev)->fb_tunnel_dev)) + return -EPERM; + dev = t->dev; + } + unregister_netdevice(dev); + return 0; +} +static int +ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + switch (cmd) { + case SIOCGETTUNNEL: + return ipip6_tunnel_get(dev, ifr); + case SIOCADDTUNNEL: + return ipip6_tunnel_add(dev, ifr); + case SIOCCHGTUNNEL: + return ipip6_tunnel_change(dev, ifr); + case SIOCDELTUNNEL: + return ipip6_tunnel_del(dev, ifr); case SIOCGETPRL: - err = -EINVAL; - if (dev == sitn->fb_tunnel_dev) - goto done; - err = ipip6_tunnel_get_prl(t, ifr->ifr_ifru.ifru_data); - break; - + return ipip6_tunnel_get_prl(dev, ifr); case SIOCADDPRL: case SIOCDELPRL: case SIOCCHGPRL: - err = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - goto done; - err = -EINVAL; - if (dev == sitn->fb_tunnel_dev) - goto done; - err = -EFAULT; - if (copy_from_user(&prl, ifr->ifr_ifru.ifru_data, sizeof(prl))) - goto done; - - switch (cmd) { - case SIOCDELPRL: - err = ipip6_tunnel_del_prl(t, &prl); - break; - case SIOCADDPRL: - case SIOCCHGPRL: - err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL); - break; - } - dst_cache_reset(&t->dst_cache); - netdev_state_change(dev); - break; - + return ipip6_tunnel_prl_ctl(dev, ifr, cmd); #ifdef CONFIG_IPV6_SIT_6RD + case SIOCGET6RD: + return ipip6_tunnel_get6rd(dev, ifr); case SIOCADD6RD: case SIOCCHG6RD: case SIOCDEL6RD: - err = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - goto done; - - err = -EFAULT; - if (copy_from_user(&ip6rd, ifr->ifr_ifru.ifru_data, - sizeof(ip6rd))) - goto done; - - if (cmd != SIOCDEL6RD) { - err = ipip6_tunnel_update_6rd(t, &ip6rd); - if (err < 0) - goto done; - } else - ipip6_tunnel_clone_6rd(dev, sitn); - - err = 0; - break; + return ipip6_tunnel_6rdctl(dev, ifr, cmd); #endif - default: - err = -EINVAL; + return -EINVAL; } - -done: - return err; } static const struct net_device_ops ipip6_netdev_ops = { -- cgit v1.2.3 From f60fe2df931d6c0b41c3d4f8ec3f7a429f977a3f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 May 2020 15:03:16 +0200 Subject: sit: implement
->ndo_tunnel_ctl Implement the ->ndo_tunnel_ctl method, and use ip_tunnel_ioctl to handle userspace requests for the SIOCGETTUNNEL, SIOCADDTUNNEL, SIOCCHGTUNNEL and SIOCDELTUNNEL ioctls. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- net/ipv6/sit.c | 73 +++++++++++++++++++++++++++------------------------------- 1 file changed, 34 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 7c158fdc02da..1fbb4dfbb191 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1267,60 +1267,45 @@ __ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm *p) } static int -ipip6_tunnel_get(struct net_device *dev, struct ifreq *ifr) +ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm *p) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm p; - if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - return -EFAULT; - t = ipip6_tunnel_locate(t->net, &p, 0); - } + if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) + t = ipip6_tunnel_locate(t->net, p, 0); if (!t) t = netdev_priv(dev); - - if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) - return -EFAULT; + memcpy(p, &t->parms, sizeof(*p)); return 0; } static int -ipip6_tunnel_add(struct net_device *dev, struct ifreq *ifr) +ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm *p) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm p; int err; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - return -EFAULT; - err = __ipip6_tunnel_ioctl_validate(t->net, &p); + err = __ipip6_tunnel_ioctl_validate(t->net, p); if (err) return err; - t = ipip6_tunnel_locate(t->net, &p, 1); + t = ipip6_tunnel_locate(t->net, p, 1); if (!t) return -ENOBUFS; - - if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) - return -EFAULT; return 0; } static int -ipip6_tunnel_change(struct net_device *dev, struct ifreq *ifr) +ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm *p) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm p; int err; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - return -EFAULT; - err = __ipip6_tunnel_ioctl_validate(t->net, &p); + err = __ipip6_tunnel_ioctl_validate(t->net, p); if (err) return err; - t = ipip6_tunnel_locate(t->net, &p, 0); + t = ipip6_tunnel_locate(t->net, p, 0); if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { if (!t) return -ENOENT; @@ -1329,33 +1314,28 @@ ipip6_tunnel_change(struct net_device *dev, struct ifreq *ifr) if (t->dev != dev) return -EEXIST; } else { - if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) || - (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) + if (((dev->flags & IFF_POINTOPOINT) && !p->iph.daddr) || + (!(dev->flags & IFF_POINTOPOINT) && p->iph.daddr)) return -EINVAL; t = netdev_priv(dev); } - ipip6_tunnel_update(t, &p, t->fwmark); + ipip6_tunnel_update(t, p, t->fwmark); } - if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) - return -EFAULT; return 0; } static int -ipip6_tunnel_del(struct net_device *dev, struct ifreq *ifr) +ipip6_tunnel_del(struct net_device *dev, struct ip_tunnel_parm *p) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm p; if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - return -EFAULT; - t = ipip6_tunnel_locate(t->net, &p, 0); + t = ipip6_tunnel_locate(t->net, p, 0); if (!t) return 
-ENOENT; if (t == netdev_priv(dev_to_sit_net(dev)->fb_tunnel_dev)) @@ -1366,18 +1346,32 @@ ipip6_tunnel_del(struct net_device *dev, struct ifreq *ifr) return 0; } +static int +ipip6_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +{ + switch (cmd) { + case SIOCGETTUNNEL: + return ipip6_tunnel_get(dev, p); + case SIOCADDTUNNEL: + return ipip6_tunnel_add(dev, p); + case SIOCCHGTUNNEL: + return ipip6_tunnel_change(dev, p); + case SIOCDELTUNNEL: + return ipip6_tunnel_del(dev, p); + default: + return -EINVAL; + } +} + static int ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { switch (cmd) { case SIOCGETTUNNEL: - return ipip6_tunnel_get(dev, ifr); case SIOCADDTUNNEL: - return ipip6_tunnel_add(dev, ifr); case SIOCCHGTUNNEL: - return ipip6_tunnel_change(dev, ifr); case SIOCDELTUNNEL: - return ipip6_tunnel_del(dev, ifr); + return ip_tunnel_ioctl(dev, ifr, cmd); case SIOCGETPRL: return ipip6_tunnel_get_prl(dev, ifr); case SIOCADDPRL: @@ -1404,6 +1398,7 @@ static const struct net_device_ops ipip6_netdev_ops = { .ndo_do_ioctl = ipip6_tunnel_ioctl, .ndo_get_stats64 = ip_tunnel_get_stats64, .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_tunnel_ctl = ipip6_tunnel_ctl, }; static void ipip6_dev_free(struct net_device *dev) -- cgit v1.2.3 From f098846044c9edb359bf2dae4bcf0d537dda22fe Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 May 2020 15:03:17 +0200 Subject: ipv6: stub out even more of addrconf_set_dstaddr if SIT is disabled There is no point in copying the structure from userspace or looking up a device if SIT support is disabled, as we'll eventually return -ENODEV anyway. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ab7e839753ae..8300176f91e7 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2794,6 +2794,9 @@ int addrconf_set_dstaddr(struct net *net, void __user *arg) struct net_device *dev; int err = -EINVAL; + if (!IS_ENABLED(CONFIG_IPV6_SIT)) + return -ENODEV; + rtnl_lock(); err = -EFAULT; @@ -2806,7 +2809,6 @@ int addrconf_set_dstaddr(struct net *net, void __user *arg) if (!dev) goto err_exit; -#if IS_ENABLED(CONFIG_IPV6_SIT) if (dev->type == ARPHRD_SIT) { const struct net_device_ops *ops = dev->netdev_ops; struct ifreq ifr; @@ -2842,7 +2844,6 @@ int addrconf_set_dstaddr(struct net *net, void __user *arg) err = dev_open(dev, NULL); } } -#endif err_exit: rtnl_unlock(); -- cgit v1.2.3 From 68ad6886dd3fb7d0b336363a90ace55b22f0dcb9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 May 2020 15:03:18 +0200 Subject: ipv6: streamline addrconf_set_dstaddr Factor out an addrconf_set_sit_dstaddr helper for the actual work if we found a SIT device, and only hold the rtnl lock around the device lookup and that new helper, as there is no point in holding it over a copy_from_user call. Signed-off-by: Christoph Hellwig Signed-off-by: David S.
Miller --- net/ipv6/addrconf.c | 87 +++++++++++++++++++++++------------------------------ 1 file changed, 38 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 8300176f91e7..c827edf87741 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2783,6 +2783,38 @@ put: in6_dev_put(in6_dev); } +static int addrconf_set_sit_dstaddr(struct net *net, struct net_device *dev, + struct in6_ifreq *ireq) +{ + struct ip_tunnel_parm p = { }; + mm_segment_t oldfs = get_fs(); + struct ifreq ifr; + int err; + + if (!(ipv6_addr_type(&ireq->ifr6_addr) & IPV6_ADDR_COMPATv4)) + return -EADDRNOTAVAIL; + + p.iph.daddr = ireq->ifr6_addr.s6_addr32[3]; + p.iph.version = 4; + p.iph.ihl = 5; + p.iph.protocol = IPPROTO_IPV6; + p.iph.ttl = 64; + ifr.ifr_ifru.ifru_data = (__force void __user *)&p; + + if (!dev->netdev_ops->ndo_do_ioctl) + return -EOPNOTSUPP; + set_fs(KERNEL_DS); + err = dev->netdev_ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); + set_fs(oldfs); + if (err) + return err; + + dev = __dev_get_by_name(net, p.name); + if (!dev) + return -ENOBUFS; + return dev_open(dev, NULL); +} + /* * Set destination address. * Special case for SIT interfaces where we create a new "virtual" @@ -2790,62 +2822,19 @@ put: */ int addrconf_set_dstaddr(struct net *net, void __user *arg) { - struct in6_ifreq ireq; struct net_device *dev; - int err = -EINVAL; + struct in6_ifreq ireq; + int err = -ENODEV; if (!IS_ENABLED(CONFIG_IPV6_SIT)) return -ENODEV; - - rtnl_lock(); - - err = -EFAULT; if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) - goto err_exit; + return -EFAULT; + rtnl_lock(); dev = __dev_get_by_index(net, ireq.ifr6_ifindex); - - err = -ENODEV; - if (!dev) - goto err_exit; - - if (dev->type == ARPHRD_SIT) { - const struct net_device_ops *ops = dev->netdev_ops; - struct ifreq ifr; - struct ip_tunnel_parm p; - - err = -EADDRNOTAVAIL; - if (!(ipv6_addr_type(&ireq.ifr6_addr) & IPV6_ADDR_COMPATv4)) - goto err_exit; - - memset(&p, 0, sizeof(p)); - p.iph.daddr = ireq.ifr6_addr.s6_addr32[3]; - p.iph.saddr = 0; - p.iph.version = 4; - p.iph.ihl = 5; - p.iph.protocol = IPPROTO_IPV6; - p.iph.ttl = 64; - ifr.ifr_ifru.ifru_data = (__force void __user *)&p; - - if (ops->ndo_do_ioctl) { - mm_segment_t oldfs = get_fs(); - - set_fs(KERNEL_DS); - err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); - set_fs(oldfs); - } else - err = -EOPNOTSUPP; - - if (err == 0) { - err = -ENOBUFS; - dev = __dev_get_by_name(net, p.name); - if (!dev) - goto err_exit; - err = dev_open(dev, NULL); - } - } - -err_exit: + if (dev && dev->type == ARPHRD_SIT) + err = addrconf_set_sit_dstaddr(net, dev, &ireq); rtnl_unlock(); return err; } -- cgit v1.2.3 From 8e3db0bbb29aa8d135341c5327bae738e93932a7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 May 2020 15:03:19 +0200 Subject: ipv6: use ->ndo_tunnel_ctl in addrconf_set_dstaddr Use the new ->ndo_tunnel_ctl instead of overriding the address limit and using ->ndo_do_ioctl just to do a pointless user copy. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- net/ipv6/addrconf.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c827edf87741..09cfbf5dd7ce 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2787,8 +2787,6 @@ static int addrconf_set_sit_dstaddr(struct net *net, struct net_device *dev, struct in6_ifreq *ireq) { struct ip_tunnel_parm p = { }; - mm_segment_t oldfs = get_fs(); - struct ifreq ifr; int err; if (!(ipv6_addr_type(&ireq->ifr6_addr) & IPV6_ADDR_COMPATv4)) @@ -2799,13 +2797,10 @@ static int addrconf_set_sit_dstaddr(struct net *net, struct net_device *dev, p.iph.ihl = 5; p.iph.protocol = IPPROTO_IPV6; p.iph.ttl = 64; - ifr.ifr_ifru.ifru_data = (__force void __user *)&p; - if (!dev->netdev_ops->ndo_do_ioctl) + if (!dev->netdev_ops->ndo_tunnel_ctl) return -EOPNOTSUPP; - set_fs(KERNEL_DS); - err = dev->netdev_ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); - set_fs(oldfs); + err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, SIOCADDTUNNEL); if (err) return err; -- cgit v1.2.3 From 4f65e2f483b6f764c15094d14dd53dda048a4048 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 May 2020 15:50:12 -0700 Subject: net: unexport skb_gro_receive() skb_gro_receive() used to be used by SCTP; that is no longer the case. skb_gro_receive_list() is in the same category: never used from modules. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 35a133c6d13b..b8afefe6f6b6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3727,7 +3727,6 @@ int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) return 0; } -EXPORT_SYMBOL_GPL(skb_gro_receive_list); /** * skb_segment - Perform protocol segmentation on skb. @@ -4191,7 +4190,6 @@ done: NAPI_GRO_CB(skb)->same_flow = 1; return 0; } -EXPORT_SYMBOL_GPL(skb_gro_receive); #ifdef CONFIG_SKB_EXTENSIONS #define SKB_EXT_ALIGN_VALUE 8 -- cgit v1.2.3 From 3ca44c16b0dcc764b641ee4ac226909f5c421aa3 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 19 May 2020 13:25:19 -0700 Subject: Bluetooth: Consolidate encryption handling in hci_encrypt_cfm This makes hci_encrypt_cfm call hci_connect_cfm when the connection state is BT_CONFIG, so callers don't have to check the state.
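For illustration only, here is a minimal standalone model of that consolidation (simplified stand-in types and flags, not the kernel's; the real hunk follows below): once the BT_CONFIG branch lives inside the callee, every caller collapses to a single unconditional call.

    /* encrypt_cfm_model.c - toy model of folding a per-caller state check
     * into the callee; types and values are simplified stand-ins. */
    #include <stdio.h>

    enum conn_state { CFG, CONNECTED };

    struct conn {
        enum conn_state state;
        int encrypt_on;   /* stands in for HCI_CONN_ENCRYPT */
        int aes_ccm;      /* stands in for HCI_CONN_AES_CCM */
    };

    static void connect_cfm(struct conn *c, int status)
    {
        (void)c;
        printf("connect_cfm(status=%d)\n", status);
    }

    static void encrypt_cfm(struct conn *c, int status)
    {
        int encrypt;

        /* The state check callers used to open-code now lives here
         * (the kernel version also adjusts the state based on status). */
        if (c->state == CFG) {
            c->state = CONNECTED;
            connect_cfm(c, status);
            return;
        }

        /* Derive the encrypt value from the connection flags. */
        if (!c->encrypt_on)
            encrypt = 0x00;
        else if (c->aes_ccm)
            encrypt = 0x02;
        else
            encrypt = 0x01;

        printf("encrypt_cfm(status=%d, encrypt=0x%02x)\n", status, encrypt);
    }

    int main(void)
    {
        struct conn c = { CFG, 1, 0 };
        encrypt_cfm(&c, 0);  /* still configuring: routed to connect_cfm */
        encrypt_cfm(&c, 0);  /* connected: reports encrypt=0x01 */
        return 0;
    }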
Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 20 ++++++++++++++++++-- net/bluetooth/hci_event.c | 28 +++------------------------- 2 files changed, 21 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 5dcf85f186c6..cdd4f1db8670 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1381,10 +1381,26 @@ static inline void hci_auth_cfm(struct hci_conn *conn, __u8 status) conn->security_cfm_cb(conn, status); } -static inline void hci_encrypt_cfm(struct hci_conn *conn, __u8 status, - __u8 encrypt) +static inline void hci_encrypt_cfm(struct hci_conn *conn, __u8 status) { struct hci_cb *cb; + __u8 encrypt; + + if (conn->state == BT_CONFIG) { + if (status) + conn->state = BT_CONNECTED; + + hci_connect_cfm(conn, status); + hci_conn_drop(conn); + return; + } + + if (!test_bit(HCI_CONN_ENCRYPT, &conn->flags)) + encrypt = 0x00; + else if (test_bit(HCI_CONN_AES_CCM, &conn->flags)) + encrypt = 0x02; + else + encrypt = 0x01; if (conn->sec_level == BT_SECURITY_SDP) conn->sec_level = BT_SECURITY_LOW; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index f024b3d57a1c..cfeaee347db3 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2931,7 +2931,7 @@ static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) &cp); } else { clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); - hci_encrypt_cfm(conn, ev->status, 0x00); + hci_encrypt_cfm(conn, ev->status); } } @@ -3016,22 +3016,7 @@ static void read_enc_key_size_complete(struct hci_dev *hdev, u8 status, conn->enc_key_size = rp->key_size; } - if (conn->state == BT_CONFIG) { - conn->state = BT_CONNECTED; - hci_connect_cfm(conn, 0); - hci_conn_drop(conn); - } else { - u8 encrypt; - - if (!test_bit(HCI_CONN_ENCRYPT, &conn->flags)) - encrypt = 0x00; - else if (test_bit(HCI_CONN_AES_CCM, &conn->flags)) - encrypt = 0x02; - else - encrypt = 0x01; - - hci_encrypt_cfm(conn, 0, encrypt); - } + hci_encrypt_cfm(conn, 0); unlock: hci_dev_unlock(hdev); @@ -3149,14 +3134,7 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) } notify: - if (conn->state == BT_CONFIG) { - if (!ev->status) - conn->state = BT_CONNECTED; - - hci_connect_cfm(conn, ev->status); - hci_conn_drop(conn); - } else - hci_encrypt_cfm(conn, ev->status, ev->encrypt); + hci_encrypt_cfm(conn, ev->status); unlock: hci_dev_unlock(hdev); -- cgit v1.2.3 From 755dfcbca83710fa967d0efa7c5bb601f871a747 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 19 May 2020 13:25:17 -0700 Subject: Bluetooth: Fix assuming EIR flags can result in SSP authentication EIR flags should only hint that SSP may be supported; we shall verify this against the actual features, since the SSP bits may be disabled in the lower layers, which would result in legacy authentication being used.
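The rule the fix enforces can be stated as a one-line predicate; the snippet below is a hedged sketch with invented helper names, not the kernel's API.

    /* ssp_hint.c - an EIR advertisement of SSP is advisory; only the
     * negotiated features decide whether SSP can actually be used. */
    #include <stdio.h>

    static int can_use_ssp(int eir_hints_ssp, int local_ssp, int remote_ssp)
    {
        (void)eir_hints_ssp;             /* a hint only - never trust it */
        return local_ssp && remote_ssp;  /* both sides must really support it */
    }

    int main(void)
    {
        /* EIR claims SSP, but the host stack has SSP disabled: fall back
         * to legacy authentication rather than assuming SSP. */
        printf("use SSP: %d\n", can_use_ssp(1, 0, 1));
        return 0;
    }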
Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_conn.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 07c34c55fc50..307800fd18e6 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -225,8 +225,6 @@ static void hci_acl_create_connection(struct hci_conn *conn) } memcpy(conn->dev_class, ie->data.dev_class, 3); - if (ie->data.ssp_mode > 0) - set_bit(HCI_CONN_SSP_ENABLED, &conn->flags); } cp.pkt_type = cpu_to_le16(conn->pkt_type); -- cgit v1.2.3 From f78cdbd75a57245ecc68f5a40e470933426a082b Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Wed, 20 May 2020 12:41:47 -0700 Subject: rds: fix crash in rds_info_getsockopt() The conversion to pin_user_pages() had a bug: it overlooked the case of allocation of pages failing. Fix that by restoring an equivalent check. Reported-by: syzbot+118ac0af4ac7f785a45b@syzkaller.appspotmail.com Fixes: dbfe7d74376e ("rds: convert get_user_pages() --> pin_user_pages()") Cc: David S. Miller Cc: Jakub Kicinski Cc: netdev@vger.kernel.org Cc: linux-rdma@vger.kernel.org Cc: rds-devel@oss.oracle.com Signed-off-by: John Hubbard Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/info.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/info.c b/net/rds/info.c index e1d63563e81c..b6b46a8214a0 100644 --- a/net/rds/info.c +++ b/net/rds/info.c @@ -234,7 +234,8 @@ call_func: ret = -EFAULT; out: - unpin_user_pages(pages, nr_pages); + if (pages) + unpin_user_pages(pages, nr_pages); kfree(pages); return ret; -- cgit v1.2.3 From 63287de66df11308d239483415d67fe94079f47b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 9 May 2020 20:58:17 -0400 Subject: lift compat definitions of mcast [sg]etsockopt requests into net/compat.h We want to get rid of compat_mc_[sg]etsockopt() and to have that stuff handled without compat_alloc_user_space(), extra copying through userland, etc. To do that we'll need ipv4 and ipv6 instances of ->compat_[sg]etsockopt() to manipulate the 32bit variants of mcast requests, so we need to move the definitions of those out of net/compat.c and into a public header. 
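To make that need concrete before the move itself, the toy program below (a simplified sketch: sockaddr_storage stands in for __kernel_sockaddr_storage, and the real member types differ) compares a natively aligned structure with its compat twin. On a 64-bit build the member offsets and total sizes differ, which is exactly why one definition cannot serve both ABIs.

    /* layout.c - compare a natively aligned struct with its compat twin. */
    #include <stdio.h>
    #include <stddef.h>
    #include <sys/socket.h>

    struct native_group_req {
        unsigned int gr_interface;
        struct sockaddr_storage gr_group;   /* naturally 8-byte aligned on 64-bit */
    };

    struct compat_group_req_m {
        unsigned int gr_interface;
        struct sockaddr_storage gr_group __attribute__((aligned(4)));
    } __attribute__((packed));

    int main(void)
    {
        printf("native: offsetof(gr_group)=%zu sizeof=%zu\n",
               offsetof(struct native_group_req, gr_group),
               sizeof(struct native_group_req));
        printf("compat: offsetof(gr_group)=%zu sizeof=%zu\n",
               offsetof(struct compat_group_req_m, gr_group),
               sizeof(struct compat_group_req_m));
        return 0;
    }

On x86-64 the native offset is 8 while the compat one is 4, matching the __aligned(4)/__packed annotations on the definitions being moved below.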
This patch just does a mechanical move to include/net/compat.h Signed-off-by: Al Viro --- include/net/compat.h | 24 ++++++++++++++++++++++++ net/compat.c | 25 ------------------------- 2 files changed, 24 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/include/net/compat.h b/include/net/compat.h index 2b5e1f7ba153..69a8cd29c0ae 100644 --- a/include/net/compat.h +++ b/include/net/compat.h @@ -74,4 +74,28 @@ int compat_mc_getsockopt(struct sock *, int, int, char __user *, int __user *, int (*)(struct sock *, int, int, char __user *, int __user *)); +struct compat_group_req { + __u32 gr_interface; + struct __kernel_sockaddr_storage gr_group + __aligned(4); +} __packed; + +struct compat_group_source_req { + __u32 gsr_interface; + struct __kernel_sockaddr_storage gsr_group + __aligned(4); + struct __kernel_sockaddr_storage gsr_source + __aligned(4); +} __packed; + +struct compat_group_filter { + __u32 gf_interface; + struct __kernel_sockaddr_storage gf_group + __aligned(4); + __u32 gf_fmode; + __u32 gf_numsrc; + struct __kernel_sockaddr_storage gf_slist[1] + __aligned(4); +} __packed; + #endif /* NET_COMPAT_H */ diff --git a/net/compat.c b/net/compat.c index 69fc6d1e4e6e..032114de4fec 100644 --- a/net/compat.c +++ b/net/compat.c @@ -448,34 +448,9 @@ COMPAT_SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, return __compat_sys_getsockopt(fd, level, optname, optval, optlen); } -struct compat_group_req { - __u32 gr_interface; - struct __kernel_sockaddr_storage gr_group - __aligned(4); -} __packed; - -struct compat_group_source_req { - __u32 gsr_interface; - struct __kernel_sockaddr_storage gsr_group - __aligned(4); - struct __kernel_sockaddr_storage gsr_source - __aligned(4); -} __packed; - -struct compat_group_filter { - __u32 gf_interface; - struct __kernel_sockaddr_storage gf_group - __aligned(4); - __u32 gf_fmode; - __u32 gf_numsrc; - struct __kernel_sockaddr_storage gf_slist[1] - __aligned(4); -} __packed; - #define __COMPAT_GF0_SIZE (sizeof(struct compat_group_filter) - \ sizeof(struct __kernel_sockaddr_storage)) - int compat_mc_setsockopt(struct sock *sock, int level, int optname, char __user *optval, unsigned int optlen, int (*setsockopt)(struct sock *, int, int, char __user *, unsigned int)) -- cgit v1.2.3 From e9c375fb5edeb550786d1436784db909bf672e9f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 9 May 2020 21:16:36 -0400 Subject: compat_ip{,v6}_setsockopt(): enumerate MCAST_... options explicitly We want to check if optname is among the MCAST_... ones; do that as an explicit switch. 
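As a side note on the pattern (an illustrative sketch with made-up option values, not the real uapi numbers): the switch names exactly the options that carry mcast request structures, rather than relying on how the constants happen to be ordered, so the dispatch stays self-documenting even if unrelated options are later defined nearby.

    /* dispatch.c - an explicit switch is self-documenting where a
     * numeric range check silently depends on constant ordering. */
    #include <stdio.h>

    enum {                       /* illustrative values only */
        MCAST_JOIN_GROUP = 42,
        MCAST_BLOCK_SOURCE,
        MCAST_UNBLOCK_SOURCE,
        MCAST_LEAVE_GROUP,
        MCAST_JOIN_SOURCE_GROUP,
        MCAST_LEAVE_SOURCE_GROUP,
        MCAST_MSFILTER,
        IP_SOMETHING_ELSE,       /* hypothetical non-mcast option */
    };

    static int is_mcast_struct_opt(int optname)
    {
        switch (optname) {
        case MCAST_JOIN_GROUP:
        case MCAST_LEAVE_GROUP:
        case MCAST_JOIN_SOURCE_GROUP:
        case MCAST_LEAVE_SOURCE_GROUP:
        case MCAST_BLOCK_SOURCE:
        case MCAST_UNBLOCK_SOURCE:
        case MCAST_MSFILTER:
            return 1;
        default:
            return 0;
        }
    }

    int main(void)
    {
        printf("MCAST_MSFILTER: %d\n", is_mcast_struct_opt(MCAST_MSFILTER));
        printf("IP_SOMETHING_ELSE: %d\n", is_mcast_struct_opt(IP_SOMETHING_ELSE));
        return 0;
    }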
Signed-off-by: Al Viro --- net/ipv4/ip_sockglue.c | 10 +++++++++- net/ipv6/ipv6_sockglue.c | 10 +++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 8206047d70b6..3c2c6cd3933b 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1272,9 +1272,17 @@ int compat_ip_setsockopt(struct sock *sk, int level, int optname, if (level != SOL_IP) return -ENOPROTOOPT; - if (optname >= MCAST_JOIN_GROUP && optname <= MCAST_MSFILTER) + switch (optname) { + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + case MCAST_MSFILTER: return compat_mc_setsockopt(sk, level, optname, optval, optlen, ip_setsockopt); + } err = do_ip_setsockopt(sk, level, optname, optval, optlen); #ifdef CONFIG_NETFILTER diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index a0e50cc57e54..96e3f603c8d8 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -973,9 +973,17 @@ int compat_ipv6_setsockopt(struct sock *sk, int level, int optname, if (level != SOL_IPV6) return -ENOPROTOOPT; - if (optname >= MCAST_JOIN_GROUP && optname <= MCAST_MSFILTER) + switch (optname) { + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + case MCAST_MSFILTER: return compat_mc_setsockopt(sk, level, optname, optval, optlen, ipv6_setsockopt); + } err = do_ipv6_setsockopt(sk, level, optname, optval, optlen); #ifdef CONFIG_NETFILTER -- cgit v1.2.3 From 931ca7ab7fe804d77bc6952f1512950c0d870f26 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Mar 2020 17:18:30 -0400 Subject: ip*_mc_gsfget(): lift copyout of struct group_filter into callers pass the userland pointer to the array in its tail, so that part gets copied out by our functions; copyout of everything else is done in the callers. Rationale: reuse for compat; the array is the same in native and compat, the layout of parts before it is different for compat. 
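From the userspace side, the interface these patches rework looks roughly like this (a hedged sketch: it assumes glibc's <netinet/in.h> exposes struct group_filter, that the socket has already joined 239.1.1.1, and error handling is trimmed to the essentials).

    /* msfget.c - query the source filter of a joined IPv4 multicast group. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <sys/socket.h>

    #define MAX_SRCS 16

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_DGRAM, 0);
        /* Room for the fixed header plus MAX_SRCS gf_slist entries, the
         * same arithmetic as GROUP_FILTER_SIZE() on the kernel side. */
        socklen_t len = sizeof(struct group_filter) +
                        (MAX_SRCS - 1) * sizeof(struct sockaddr_storage);
        struct group_filter *gf = calloc(1, len);
        struct sockaddr_in *sin = (struct sockaddr_in *)&gf->gf_group;

        gf->gf_interface = 0;        /* any interface */
        gf->gf_numsrc = MAX_SRCS;    /* capacity we are handing in */
        sin->sin_family = AF_INET;
        inet_pton(AF_INET, "239.1.1.1", &sin->sin_addr);

        if (getsockopt(fd, IPPROTO_IP, MCAST_MSFILTER, gf, &len) < 0) {
            perror("getsockopt(MCAST_MSFILTER)"); /* EADDRNOTAVAIL: not joined */
            return 1;
        }
        printf("fmode=%s, %u source(s)\n",
               gf->gf_fmode == MCAST_INCLUDE ? "include" : "exclude",
               gf->gf_numsrc);
        free(gf);
        return 0;
    }

The kernel fills the fixed header and up to gf_numsrc gf_slist entries; after this patch the header copyout happens in the ip_sockglue.c/ipv6_sockglue.c callers, while ip_mc_gsfget()/ip6_mc_msfget() only copy the array tail.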
Signed-off-by: Al Viro --- include/linux/igmp.h | 2 +- include/net/ipv6.h | 2 +- net/ipv4/igmp.c | 18 +++++------------- net/ipv4/ip_sockglue.c | 19 ++++++++++++++----- net/ipv6/ipv6_sockglue.c | 18 ++++++++++++++---- net/ipv6/mcast.c | 10 +++------- 6 files changed, 38 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index faa6586a5783..64ce8cd1cfaf 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -123,7 +123,7 @@ extern int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf,int ifindex); extern int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, struct ip_msfilter __user *optval, int __user *optlen); extern int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, - struct group_filter __user *optval, int __user *optlen); + struct sockaddr_storage __user *p); extern int ip_mc_sf_allow(struct sock *sk, __be32 local, __be32 rmt, int dif, int sdif); extern void ip_mc_init_dev(struct in_device *); diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 5fc3a9d7b053..c45eb78d970f 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1138,7 +1138,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, struct group_source_req *pgsr); int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf); int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, - struct group_filter __user *optval, int __user *optlen); + struct sockaddr_storage __user *p); #ifdef CONFIG_PROC_FS int ac6_proc_init(struct net *net); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 47f0502b2101..7b272bbed2b4 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2565,9 +2565,9 @@ done: } int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, - struct group_filter __user *optval, int __user *optlen) + struct sockaddr_storage __user *p) { - int err, i, count, copycount; + int i, count, copycount; struct sockaddr_in *psin; __be32 addr; struct ip_mc_socklist *pmc; @@ -2583,37 +2583,29 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, if (!ipv4_is_multicast(addr)) return -EINVAL; - err = -EADDRNOTAVAIL; - for_each_pmc_rtnl(inet, pmc) { if (pmc->multi.imr_multiaddr.s_addr == addr && pmc->multi.imr_ifindex == gsf->gf_interface) break; } if (!pmc) /* must have a prior join */ - goto done; + return -EADDRNOTAVAIL; gsf->gf_fmode = pmc->sfmode; psl = rtnl_dereference(pmc->sflist); count = psl ? psl->sl_count : 0; copycount = count < gsf->gf_numsrc ? 
count : gsf->gf_numsrc; gsf->gf_numsrc = count; - if (put_user(GROUP_FILTER_SIZE(copycount), optlen) || - copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) { - return -EFAULT; - } - for (i = 0; i < copycount; i++) { + for (i = 0; i < copycount; i++, p++) { struct sockaddr_storage ss; psin = (struct sockaddr_in *)&ss; memset(&ss, 0, sizeof(ss)); psin->sin_family = AF_INET; psin->sin_addr.s_addr = psl->sl_addr[i]; - if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss))) + if (copy_to_user(p, &ss, sizeof(ss))) return -EFAULT; } return 0; -done: - return err; } /* diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 3c2c6cd3933b..e3703a3e7ef4 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1473,19 +1473,28 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, } case MCAST_MSFILTER: { + struct group_filter __user *p = (void __user *)optval; struct group_filter gsf; + const int size0 = offsetof(struct group_filter, gf_slist); + int num; - if (len < GROUP_FILTER_SIZE(0)) { + if (len < size0) { err = -EINVAL; goto out; } - if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) { + if (copy_from_user(&gsf, p, size0)) { err = -EFAULT; goto out; } - err = ip_mc_gsfget(sk, &gsf, - (struct group_filter __user *)optval, - optlen); + num = gsf.gf_numsrc; + err = ip_mc_gsfget(sk, &gsf, p->gf_slist); + if (err) + goto out; + if (gsf.gf_numsrc < num) + num = gsf.gf_numsrc; + if (put_user(GROUP_FILTER_SIZE(num), optlen) || + copy_to_user(p, &gsf, size0)) + err = -EFAULT; goto out; } case IP_MULTICAST_ALL: diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 96e3f603c8d8..e4a62ca1a3d0 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -1056,18 +1056,28 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, break; case MCAST_MSFILTER: { + struct group_filter __user *p = (void __user *)optval; struct group_filter gsf; + const int size0 = offsetof(struct group_filter, gf_slist); + int num; int err; - if (len < GROUP_FILTER_SIZE(0)) + if (len < size0) return -EINVAL; - if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) + if (copy_from_user(&gsf, p, size0)) return -EFAULT; if (gsf.gf_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; + num = gsf.gf_numsrc; lock_sock(sk); - err = ip6_mc_msfget(sk, &gsf, - (struct group_filter __user *)optval, optlen); + err = ip6_mc_msfget(sk, &gsf, p->gf_slist); + if (!err) { + if (num > gsf.gf_numsrc) + num = gsf.gf_numsrc; + if (put_user(GROUP_FILTER_SIZE(num), optlen) || + copy_to_user(p, &gsf, size0)) + err = -EFAULT; + } release_sock(sk); return err; } diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index eaa4c2cc2fbb..97d796c7d6c0 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -547,7 +547,7 @@ done: } int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, - struct group_filter __user *optval, int __user *optlen) + struct sockaddr_storage *p) { int err, i, count, copycount; const struct in6_addr *group; @@ -592,14 +592,10 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; gsf->gf_numsrc = count; - if (put_user(GROUP_FILTER_SIZE(copycount), optlen) || - copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) { - return -EFAULT; - } /* changes to psl require the socket lock, and a write lock * on pmc->sflock. We have the socket lock so reading here is safe. 
*/ - for (i = 0; i < copycount; i++) { + for (i = 0; i < copycount; i++, p++) { struct sockaddr_in6 *psin6; struct sockaddr_storage ss; @@ -607,7 +603,7 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, memset(&ss, 0, sizeof(ss)); psin6->sin6_family = AF_INET6; psin6->sin6_addr = psl->sl_addr[i]; - if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss))) + if (copy_to_user(p, &ss, sizeof(ss))) return -EFAULT; } return 0; -- cgit v1.2.3 From 0dfe6581a7e35bafe5fc4d9b84edd0e66b4fd78a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Mar 2020 22:08:59 -0400 Subject: get rid of compat_mc_getsockopt() now we can do MCAST_MSFILTER in compat ->getsockopt() without playing silly buggers with copying things back and forth. We can form a native struct group_filter (sans the variable-length tail) on stack, pass that + pointer to the tail of original request to the helper doing the bulk of the work, then do the rest of copyout - same as the native getsockopt() does. Signed-off-by: Al Viro --- include/net/compat.h | 3 -- net/compat.c | 79 ------------------------------------------------ net/ipv4/ip_sockglue.c | 44 +++++++++++++++++++++++++-- net/ipv6/ipv6_sockglue.c | 41 +++++++++++++++++++++++-- 4 files changed, 79 insertions(+), 88 deletions(-) (limited to 'net') diff --git a/include/net/compat.h b/include/net/compat.h index 69a8cd29c0ae..d714076d63d5 100644 --- a/include/net/compat.h +++ b/include/net/compat.h @@ -70,9 +70,6 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *, struct sock *, int compat_mc_setsockopt(struct sock *, int, int, char __user *, unsigned int, int (*)(struct sock *, int, int, char __user *, unsigned int)); -int compat_mc_getsockopt(struct sock *, int, int, char __user *, int __user *, - int (*)(struct sock *, int, int, char __user *, - int __user *)); struct compat_group_req { __u32 gr_interface; diff --git a/net/compat.c b/net/compat.c index 032114de4fec..7bdfda2b382a 100644 --- a/net/compat.c +++ b/net/compat.c @@ -538,85 +538,6 @@ int compat_mc_setsockopt(struct sock *sock, int level, int optname, } EXPORT_SYMBOL(compat_mc_setsockopt); -int compat_mc_getsockopt(struct sock *sock, int level, int optname, - char __user *optval, int __user *optlen, - int (*getsockopt)(struct sock *, int, int, char __user *, int __user *)) -{ - struct compat_group_filter __user *gf32 = (void __user *)optval; - struct group_filter __user *kgf; - int __user *koptlen; - u32 interface, fmode, numsrc; - int klen, ulen, err; - - if (optname != MCAST_MSFILTER) - return getsockopt(sock, level, optname, optval, optlen); - - koptlen = compat_alloc_user_space(sizeof(*koptlen)); - if (!access_ok(optlen, sizeof(*optlen)) || - __get_user(ulen, optlen)) - return -EFAULT; - - /* adjust len for pad */ - klen = ulen + sizeof(*kgf) - sizeof(*gf32); - - if (klen < GROUP_FILTER_SIZE(0)) - return -EINVAL; - - if (!access_ok(koptlen, sizeof(*koptlen)) || - __put_user(klen, koptlen)) - return -EFAULT; - - /* have to allow space for previous compat_alloc_user_space, too */ - kgf = compat_alloc_user_space(klen+sizeof(*optlen)); - - if (!access_ok(gf32, __COMPAT_GF0_SIZE) || - __get_user(interface, &gf32->gf_interface) || - __get_user(fmode, &gf32->gf_fmode) || - __get_user(numsrc, &gf32->gf_numsrc) || - __put_user(interface, &kgf->gf_interface) || - __put_user(fmode, &kgf->gf_fmode) || - __put_user(numsrc, &kgf->gf_numsrc) || - copy_in_user(&kgf->gf_group, &gf32->gf_group, sizeof(kgf->gf_group))) - return -EFAULT; - - err = getsockopt(sock, level, optname, (char __user *)kgf, koptlen); - if (err) 
- return err; - - if (!access_ok(koptlen, sizeof(*koptlen)) || - __get_user(klen, koptlen)) - return -EFAULT; - - ulen = klen - (sizeof(*kgf)-sizeof(*gf32)); - - if (!access_ok(optlen, sizeof(*optlen)) || - __put_user(ulen, optlen)) - return -EFAULT; - - if (!access_ok(kgf, klen) || - !access_ok(gf32, ulen) || - __get_user(interface, &kgf->gf_interface) || - __get_user(fmode, &kgf->gf_fmode) || - __get_user(numsrc, &kgf->gf_numsrc) || - __put_user(interface, &gf32->gf_interface) || - __put_user(fmode, &gf32->gf_fmode) || - __put_user(numsrc, &gf32->gf_numsrc)) - return -EFAULT; - if (numsrc) { - int copylen; - - klen -= GROUP_FILTER_SIZE(0); - copylen = numsrc * sizeof(gf32->gf_slist[0]); - if (copylen > klen) - copylen = klen; - if (copy_in_user(gf32->gf_slist, kgf->gf_slist, copylen)) - return -EFAULT; - } - return err; -} -EXPORT_SYMBOL(compat_mc_getsockopt); - - /* Argument list sizes for compat_sys_socketcall */ #define AL(x) ((x) * sizeof(u32)) static unsigned char nas[21] = { diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index e3703a3e7ef4..65a30e7672ff 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1607,9 +1607,47 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname, { int err; - if (optname == MCAST_MSFILTER) - return compat_mc_getsockopt(sk, level, optname, optval, optlen, - ip_getsockopt); + if (optname == MCAST_MSFILTER) { + const int size0 = offsetof(struct compat_group_filter, gf_slist); + struct compat_group_filter __user *p = (void __user *)optval; + struct compat_group_filter gf32; + struct group_filter gf; + int ulen, err; + int num; + + if (level != SOL_IP) + return -EOPNOTSUPP; + + if (get_user(ulen, optlen)) + return -EFAULT; + + if (ulen < size0) + return -EINVAL; + + if (copy_from_user(&gf32, p, size0)) + return -EFAULT; + + gf.gf_interface = gf32.gf_interface; + gf.gf_fmode = gf32.gf_fmode; + num = gf.gf_numsrc = gf32.gf_numsrc; + gf.gf_group = gf32.gf_group; + + rtnl_lock(); + lock_sock(sk); + err = ip_mc_gsfget(sk, &gf, p->gf_slist); + release_sock(sk); + rtnl_unlock(); + if (err) + return err; + if (gf.gf_numsrc < num) + num = gf.gf_numsrc; + ulen = GROUP_FILTER_SIZE(num) - (sizeof(gf) - sizeof(gf32)); + if (put_user(ulen, optlen) || + put_user(gf.gf_fmode, &p->gf_fmode) || + put_user(gf.gf_numsrc, &p->gf_numsrc)) + return -EFAULT; + return 0; + } err = do_ip_getsockopt(sk, level, optname, optval, optlen, MSG_CMSG_COMPAT); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index e4a62ca1a3d0..0bbafe73bdde 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -1446,9 +1446,44 @@ int compat_ipv6_getsockopt(struct sock *sk, int level, int optname, if (level != SOL_IPV6) return -ENOPROTOOPT; - if (optname == MCAST_MSFILTER) - return compat_mc_getsockopt(sk, level, optname, optval, optlen, - ipv6_getsockopt); + if (optname == MCAST_MSFILTER) { + const int size0 = offsetof(struct compat_group_filter, gf_slist); + struct compat_group_filter __user *p = (void __user *)optval; + struct compat_group_filter gf32; + struct group_filter gf; + int ulen, err; + int num; + + if (get_user(ulen, optlen)) + return -EFAULT; + + if (ulen < size0) + return -EINVAL; + + if (copy_from_user(&gf32, p, size0)) + return -EFAULT; + + gf.gf_interface = gf32.gf_interface; + gf.gf_fmode = gf32.gf_fmode; + num = gf.gf_numsrc = gf32.gf_numsrc; + gf.gf_group = gf32.gf_group; + + if (gf.gf_group.ss_family != AF_INET6) + return -EADDRNOTAVAIL; + lock_sock(sk); + err = ip6_mc_msfget(sk, &gf, p->gf_slist); + 
release_sock(sk); + if (err) + return err; + if (num > gf.gf_numsrc) + num = gf.gf_numsrc; + ulen = GROUP_FILTER_SIZE(num) - (sizeof(gf)-sizeof(gf32)); + if (put_user(ulen, optlen) || + put_user(gf.gf_fmode, &p->gf_fmode) || + put_user(gf.gf_numsrc, &p->gf_numsrc)) + return -EFAULT; + return 0; + } err = do_ipv6_getsockopt(sk, level, optname, optval, optlen, MSG_CMSG_COMPAT); -- cgit v1.2.3 From e986d4dabcb8fab87b36b607bc710e42fe206baf Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Mar 2020 22:37:56 -0400 Subject: set_mcast_msfilter(): take the guts of setsockopt(MCAST_MSFILTER) into a helper Signed-off-by: Al Viro --- net/ipv4/ip_sockglue.c | 73 +++++++++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 65a30e7672ff..cc0441157b02 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -587,6 +587,43 @@ static bool setsockopt_needs_rtnl(int optname) return false; } +static int set_mcast_msfilter(struct sock *sk, int ifindex, + int numsrc, int fmode, + struct sockaddr_storage *group, + struct sockaddr_storage *list) +{ + int msize = IP_MSFILTER_SIZE(numsrc); + struct ip_msfilter *msf; + struct sockaddr_in *psin; + int err, i; + + msf = kmalloc(msize, GFP_KERNEL); + if (!msf) + return -ENOBUFS; + + psin = (struct sockaddr_in *)group; + if (psin->sin_family != AF_INET) + goto Eaddrnotavail; + msf->imsf_multiaddr = psin->sin_addr.s_addr; + msf->imsf_interface = 0; + msf->imsf_fmode = fmode; + msf->imsf_numsrc = numsrc; + for (i = 0; i < numsrc; ++i) { + psin = (struct sockaddr_in *)&list[i]; + + if (psin->sin_family != AF_INET) + goto Eaddrnotavail; + msf->imsf_slist[i] = psin->sin_addr.s_addr; + } + err = ip_mc_msfilter(sk, msf, ifindex); + kfree(msf); + return err; + +Eaddrnotavail: + kfree(msf); + return -EADDRNOTAVAIL; +} + static int do_ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { @@ -1079,10 +1116,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, } case MCAST_MSFILTER: { - struct sockaddr_in *psin; - struct ip_msfilter *msf = NULL; struct group_filter *gsf = NULL; - int msize, i, ifindex; if (optlen < GROUP_FILTER_SIZE(0)) goto e_inval; @@ -1095,7 +1129,6 @@ static int do_ip_setsockopt(struct sock *sk, int level, err = PTR_ERR(gsf); break; } - /* numsrc >= (4G-140)/128 overflow in 32 bits */ if (gsf->gf_numsrc >= 0x1ffffff || gsf->gf_numsrc > net->ipv4.sysctl_igmp_max_msf) { @@ -1106,36 +1139,10 @@ static int do_ip_setsockopt(struct sock *sk, int level, err = -EINVAL; goto mc_msf_out; } - msize = IP_MSFILTER_SIZE(gsf->gf_numsrc); - msf = kmalloc(msize, GFP_KERNEL); - if (!msf) { - err = -ENOBUFS; - goto mc_msf_out; - } - ifindex = gsf->gf_interface; - psin = (struct sockaddr_in *)&gsf->gf_group; - if (psin->sin_family != AF_INET) { - err = -EADDRNOTAVAIL; - goto mc_msf_out; - } - msf->imsf_multiaddr = psin->sin_addr.s_addr; - msf->imsf_interface = 0; - msf->imsf_fmode = gsf->gf_fmode; - msf->imsf_numsrc = gsf->gf_numsrc; - err = -EADDRNOTAVAIL; - for (i = 0; i < gsf->gf_numsrc; ++i) { - psin = (struct sockaddr_in *)&gsf->gf_slist[i]; - - if (psin->sin_family != AF_INET) - goto mc_msf_out; - msf->imsf_slist[i] = psin->sin_addr.s_addr; - } - kfree(gsf); - gsf = NULL; - - err = ip_mc_msfilter(sk, msf, ifindex); + err = set_mcast_msfilter(sk, gsf->gf_interface, + gsf->gf_numsrc, gsf->gf_fmode, + &gsf->gf_group, gsf->gf_slist); mc_msf_out: - kfree(msf); kfree(gsf); break; } -- cgit 
v1.2.3 From 2e04172875c9daf929659eb5c3ef4b98fdf34396 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 30 Mar 2020 15:39:43 -0400 Subject: ipv4: do compat setsockopt for MCAST_MSFILTER directly Parallel to what the native setsockopt() does, except that unlike the native setsockopt() we do not use memdup_user() - we want the sockaddr_storage fields properly aligned, so we allocate 4 bytes more and copy compat_group_filter at the offset 4, which yields the proper alignments. Signed-off-by: Al Viro --- net/ipv4/ip_sockglue.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index cc0441157b02..b6b889b5dacf 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1286,9 +1286,55 @@ int compat_ip_setsockopt(struct sock *sk, int level, int optname, case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: - case MCAST_MSFILTER: return compat_mc_setsockopt(sk, level, optname, optval, optlen, ip_setsockopt); + case MCAST_MSFILTER: + { + const int size0 = offsetof(struct compat_group_filter, gf_slist); + struct compat_group_filter *gf32; + void *p; + int n; + + if (optlen < size0) + return -EINVAL; + if (optlen > sysctl_optmem_max - 4) + return -ENOBUFS; + + p = kmalloc(optlen + 4, GFP_KERNEL); + if (!p) + return -ENOMEM; + gf32 = p + 4; /* we want ->gf_group and ->gf_slist aligned */ + if (copy_from_user(gf32, optval, optlen)) { + err = -EFAULT; + goto mc_msf_out; + } + + n = gf32->gf_numsrc; + /* numsrc >= (4G-140)/128 overflow in 32 bits */ + if (n >= 0x1ffffff) { + err = -ENOBUFS; + goto mc_msf_out; + } + if (offsetof(struct compat_group_filter, gf_slist[n]) > optlen) { + err = -EINVAL; + goto mc_msf_out; + } + + rtnl_lock(); + lock_sock(sk); + /* numsrc >= (4G-140)/128 overflow in 32 bits */ + if (n > sock_net(sk)->ipv4.sysctl_igmp_max_msf) + err = -ENOBUFS; + else + err = set_mcast_msfilter(sk, gf32->gf_interface, + n, gf32->gf_fmode, + &gf32->gf_group, gf32->gf_slist); + release_sock(sk); + rtnl_unlock(); +mc_msf_out: + kfree(p); + return err; + } } err = do_ip_setsockopt(sk, level, optname, optval, optlen); -- cgit v1.2.3 From d59eb177c84f9572a6b51024c0b2611c3b5a27c5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 30 Mar 2020 15:43:10 -0400 Subject: ip6_mc_msfilter(): pass the address list separately that way we'll be able to reuse it for compat case Signed-off-by: Al Viro --- include/net/ipv6.h | 3 ++- net/ipv6/ipv6_sockglue.c | 2 +- net/ipv6/mcast.c | 7 ++++--- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index c45eb78d970f..39a00d3ef5e2 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1136,7 +1136,8 @@ struct group_filter; int ip6_mc_source(int add, int omode, struct sock *sk, struct group_source_req *pgsr); -int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf); +int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, + struct sockaddr_storage *list); int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, struct sockaddr_storage __user *p); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 0bbafe73bdde..7d3ecc0e69d1 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -780,7 +780,7 @@ done: retv = -EINVAL; break; } - retv = ip6_mc_msfilter(sk, gsf); + retv = ip6_mc_msfilter(sk, gsf, gsf->gf_slist); kfree(gsf); break; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 
97d796c7d6c0..7e12d2114158 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -457,7 +457,8 @@ done: return err; } -int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) +int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, + struct sockaddr_storage *list) { const struct in6_addr *group; struct ipv6_mc_socklist *pmc; @@ -509,10 +510,10 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) goto done; } newpsl->sl_max = newpsl->sl_count = gsf->gf_numsrc; - for (i = 0; i < newpsl->sl_count; ++i) { + for (i = 0; i < newpsl->sl_count; ++i, ++list) { struct sockaddr_in6 *psin6; - psin6 = (struct sockaddr_in6 *)&gsf->gf_slist[i]; + psin6 = (struct sockaddr_in6 *)list; newpsl->sl_addr[i] = psin6->sin6_addr; } err = ip6_mc_add_src(idev, group, gsf->gf_fmode, -- cgit v1.2.3 From 168a2cca81438aef819e43feb161614488dee97b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 30 Mar 2020 16:40:04 -0400 Subject: ipv6: do compat setsockopt for MCAST_MSFILTER directly similar to the ipv4 counterpart of that patch - the same trick used to align the tail array properly. Signed-off-by: Al Viro --- net/ipv6/ipv6_sockglue.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 7d3ecc0e69d1..2b5029df8f1e 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -980,9 +980,55 @@ int compat_ipv6_setsockopt(struct sock *sk, int level, int optname, case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: - case MCAST_MSFILTER: return compat_mc_setsockopt(sk, level, optname, optval, optlen, ipv6_setsockopt); + case MCAST_MSFILTER: + { + const int size0 = offsetof(struct compat_group_filter, gf_slist); + struct compat_group_filter *gf32; + void *p; + int n; + + if (optlen < size0) + return -EINVAL; + if (optlen > sysctl_optmem_max - 4) + return -ENOBUFS; + + p = kmalloc(optlen + 4, GFP_KERNEL); + if (!p) + return -ENOMEM; + + gf32 = p + 4; /* we want ->gf_group and ->gf_slist aligned */ + if (copy_from_user(gf32, optval, optlen)) { + err = -EFAULT; + goto mc_msf_out; + } + + n = gf32->gf_numsrc; + /* numsrc >= (4G-140)/128 overflow in 32 bits */ + if (n >= 0x1ffffffU || + n > sysctl_mld_max_msf) { + err = -ENOBUFS; + goto mc_msf_out; + } + if (offsetof(struct compat_group_filter, gf_slist[n]) > optlen) { + err = -EINVAL; + goto mc_msf_out; + } + + rtnl_lock(); + lock_sock(sk); + err = ip6_mc_msfilter(sk, &(struct group_filter){ + .gf_interface = gf32->gf_interface, + .gf_group = gf32->gf_group, + .gf_fmode = gf32->gf_fmode, + .gf_numsrc = gf32->gf_numsrc}, gf32->gf_slist); + release_sock(sk); + rtnl_unlock(); +mc_msf_out: + kfree(p); + return err; + } } err = do_ipv6_setsockopt(sk, level, optname, optval, optlen); -- cgit v1.2.3 From 2f984f11fdc06bcfd5bb528d07a93c20301dd068 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 26 Apr 2020 19:56:22 -0400 Subject: ipv[46]: do compat setsockopt for MCAST_{JOIN,LEAVE}_GROUP directly direct parallel to the way these two are handled in the native ->setsockopt() instances - the helpers that do the real work are already separated and can be reused as-is in this case. 
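For reference, a minimal userspace sketch of the request this compat path serves. The helper name join_group(), the interface index and the 224.0.1.1 group address are arbitrary examples, not taken from the patch; built as a 32-bit binary on a 64-bit kernel, the setsockopt() below goes through the copyin added here and ends up in the same ip_mc_join_group() helper as the native path:

#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

/* Minimal sketch, example values only: join an IPv4 multicast group
 * through the protocol-independent API of RFC 3678. A 32-bit binary
 * lays out struct group_req with 4-byte alignment, which is the layout
 * the compat copyin above converts from.
 */
static int join_group(int fd, unsigned int ifindex)
{
        struct group_req greq;
        struct sockaddr_in *psin = (struct sockaddr_in *)&greq.gr_group;

        memset(&greq, 0, sizeof(greq));
        greq.gr_interface = ifindex;
        psin->sin_family = AF_INET;
        psin->sin_addr.s_addr = htonl(0xe0000101);      /* 224.0.1.1 */

        return setsockopt(fd, IPPROTO_IP, MCAST_JOIN_GROUP,
                          &greq, sizeof(greq));
}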
Signed-off-by: Al Viro --- net/ipv4/ip_sockglue.c | 31 +++++++++++++++++++++++++++++++ net/ipv6/ipv6_sockglue.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index b6b889b5dacf..34c3a43a9c98 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1282,6 +1282,37 @@ int compat_ip_setsockopt(struct sock *sk, int level, int optname, switch (optname) { case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: + { + struct compat_group_req __user *gr32 = (void __user *)optval; + struct group_req greq; + struct sockaddr_in *psin = (struct sockaddr_in *)&greq.gr_group; + struct ip_mreqn mreq; + + if (optlen < sizeof(struct compat_group_req)) + return -EINVAL; + + if (get_user(greq.gr_interface, &gr32->gr_interface) || + copy_from_user(&greq.gr_group, &gr32->gr_group, + sizeof(greq.gr_group))) + return -EFAULT; + + if (psin->sin_family != AF_INET) + return -EINVAL; + + memset(&mreq, 0, sizeof(mreq)); + mreq.imr_multiaddr = psin->sin_addr; + mreq.imr_ifindex = greq.gr_interface; + + rtnl_lock(); + lock_sock(sk); + if (optname == MCAST_JOIN_GROUP) + err = ip_mc_join_group(sk, &mreq); + else + err = ip_mc_leave_group(sk, &mreq); + release_sock(sk); + rtnl_unlock(); + return err; + } case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 2b5029df8f1e..209d827950cc 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -976,6 +976,34 @@ int compat_ipv6_setsockopt(struct sock *sk, int level, int optname, switch (optname) { case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: + { + struct compat_group_req __user *gr32 = (void __user *)optval; + struct group_req greq; + struct sockaddr_in6 *psin6 = (struct sockaddr_in6 *)&greq.gr_group; + + if (optlen < sizeof(struct compat_group_req)) + return -EINVAL; + + if (get_user(greq.gr_interface, &gr32->gr_interface) || + copy_from_user(&greq.gr_group, &gr32->gr_group, + sizeof(greq.gr_group))) + return -EFAULT; + + if (greq.gr_group.ss_family != AF_INET6) + return -EADDRNOTAVAIL; + + rtnl_lock(); + lock_sock(sk); + if (optname == MCAST_JOIN_GROUP) + err = ipv6_sock_mc_join(sk, greq.gr_interface, + &psin6->sin6_addr); + else + err = ipv6_sock_mc_drop(sk, greq.gr_interface, + &psin6->sin6_addr); + release_sock(sk); + rtnl_unlock(); + return err; + } case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: -- cgit v1.2.3 From 2bbf8c1ead651c65bd0f7b6ba6d57cb09e2d1a57 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 27 Apr 2020 10:49:26 -0400 Subject: ipv4: take handling of group_source_req options into a helper Signed-off-by: Al Viro --- net/ipv4/ip_sockglue.c | 83 ++++++++++++++++++++++++++------------------------ 1 file changed, 44 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 34c3a43a9c98..7f065a68664e 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -624,6 +624,49 @@ Eaddrnotavail: return -EADDRNOTAVAIL; } +static int do_mcast_group_source(struct sock *sk, int optname, + struct group_source_req *greqs) +{ + struct ip_mreq_source mreqs; + struct sockaddr_in *psin; + int omode, add, err; + + if (greqs->gsr_group.ss_family != AF_INET || + greqs->gsr_source.ss_family != AF_INET) + return -EADDRNOTAVAIL; + + psin = (struct sockaddr_in *)&greqs->gsr_group; + mreqs.imr_multiaddr = psin->sin_addr.s_addr; + psin = (struct sockaddr_in 
*)&greqs->gsr_source; + mreqs.imr_sourceaddr = psin->sin_addr.s_addr; + mreqs.imr_interface = 0; /* use index for mc_source */ + + if (optname == MCAST_BLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 1; + } else if (optname == MCAST_UNBLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 0; + } else if (optname == MCAST_JOIN_SOURCE_GROUP) { + struct ip_mreqn mreq; + + psin = (struct sockaddr_in *)&greqs->gsr_group; + mreq.imr_multiaddr = psin->sin_addr; + mreq.imr_address.s_addr = 0; + mreq.imr_ifindex = greqs->gsr_interface; + err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE); + if (err && err != -EADDRINUSE) + return err; + greqs->gsr_interface = mreq.imr_ifindex; + omode = MCAST_INCLUDE; + add = 1; + } else /* MCAST_LEAVE_SOURCE_GROUP */ { + omode = MCAST_INCLUDE; + add = 0; + } + return ip_mc_source(add, omode, sk, &mreqs, greqs->gsr_interface); +} + static int do_ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { @@ -1066,9 +1109,6 @@ static int do_ip_setsockopt(struct sock *sk, int level, case MCAST_UNBLOCK_SOURCE: { struct group_source_req greqs; - struct ip_mreq_source mreqs; - struct sockaddr_in *psin; - int omode, add; if (optlen != sizeof(struct group_source_req)) goto e_inval; @@ -1076,42 +1116,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, err = -EFAULT; break; } - if (greqs.gsr_group.ss_family != AF_INET || - greqs.gsr_source.ss_family != AF_INET) { - err = -EADDRNOTAVAIL; - break; - } - psin = (struct sockaddr_in *)&greqs.gsr_group; - mreqs.imr_multiaddr = psin->sin_addr.s_addr; - psin = (struct sockaddr_in *)&greqs.gsr_source; - mreqs.imr_sourceaddr = psin->sin_addr.s_addr; - mreqs.imr_interface = 0; /* use index for mc_source */ - - if (optname == MCAST_BLOCK_SOURCE) { - omode = MCAST_EXCLUDE; - add = 1; - } else if (optname == MCAST_UNBLOCK_SOURCE) { - omode = MCAST_EXCLUDE; - add = 0; - } else if (optname == MCAST_JOIN_SOURCE_GROUP) { - struct ip_mreqn mreq; - - psin = (struct sockaddr_in *)&greqs.gsr_group; - mreq.imr_multiaddr = psin->sin_addr; - mreq.imr_address.s_addr = 0; - mreq.imr_ifindex = greqs.gsr_interface; - err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE); - if (err && err != -EADDRINUSE) - break; - greqs.gsr_interface = mreq.imr_ifindex; - omode = MCAST_INCLUDE; - add = 1; - } else /* MCAST_LEAVE_SOURCE_GROUP */ { - omode = MCAST_INCLUDE; - add = 0; - } - err = ip_mc_source(add, omode, sk, &mreqs, - greqs.gsr_interface); + err = do_mcast_group_source(sk, optname, &greqs); break; } case MCAST_MSFILTER: -- cgit v1.2.3 From fcfa0b09d3f794af66cf8bcee03ddb7055934742 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 27 Apr 2020 11:00:01 -0400 Subject: ipv6: take handling of group_source_req options into a helper Signed-off-by: Al Viro --- net/ipv6/ipv6_sockglue.c | 65 +++++++++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 209d827950cc..bb049feeb787 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -136,6 +136,41 @@ static bool setsockopt_needs_rtnl(int optname) return false; } +static int do_ipv6_mcast_group_source(struct sock *sk, int optname, + struct group_source_req *greqs) +{ + int omode, add; + + if (greqs->gsr_group.ss_family != AF_INET6 || + greqs->gsr_source.ss_family != AF_INET6) + return -EADDRNOTAVAIL; + + if (optname == MCAST_BLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 1; + } else if (optname == MCAST_UNBLOCK_SOURCE) { + omode = 
MCAST_EXCLUDE; + add = 0; + } else if (optname == MCAST_JOIN_SOURCE_GROUP) { + struct sockaddr_in6 *psin6; + int retv; + + psin6 = (struct sockaddr_in6 *)&greqs->gsr_group; + retv = ipv6_sock_mc_join_ssm(sk, greqs->gsr_interface, + &psin6->sin6_addr, + MCAST_INCLUDE); + /* prior join w/ different source is ok */ + if (retv && retv != -EADDRINUSE) + return retv; + omode = MCAST_INCLUDE; + add = 1; + } else /* MCAST_LEAVE_SOURCE_GROUP */ { + omode = MCAST_INCLUDE; + add = 0; + } + return ip6_mc_source(add, omode, sk, greqs); +} + static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { @@ -715,7 +750,6 @@ done: case MCAST_UNBLOCK_SOURCE: { struct group_source_req greqs; - int omode, add; if (optlen < sizeof(struct group_source_req)) goto e_inval; @@ -723,34 +757,7 @@ done: retv = -EFAULT; break; } - if (greqs.gsr_group.ss_family != AF_INET6 || - greqs.gsr_source.ss_family != AF_INET6) { - retv = -EADDRNOTAVAIL; - break; - } - if (optname == MCAST_BLOCK_SOURCE) { - omode = MCAST_EXCLUDE; - add = 1; - } else if (optname == MCAST_UNBLOCK_SOURCE) { - omode = MCAST_EXCLUDE; - add = 0; - } else if (optname == MCAST_JOIN_SOURCE_GROUP) { - struct sockaddr_in6 *psin6; - - psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; - retv = ipv6_sock_mc_join_ssm(sk, greqs.gsr_interface, - &psin6->sin6_addr, - MCAST_INCLUDE); - /* prior join w/ different source is ok */ - if (retv && retv != -EADDRINUSE) - break; - omode = MCAST_INCLUDE; - add = 1; - } else /* MCAST_LEAVE_SOURCE_GROUP */ { - omode = MCAST_INCLUDE; - add = 0; - } - retv = ip6_mc_source(add, omode, sk, &greqs); + retv = do_ipv6_mcast_group_source(sk, optname, &greqs); break; } case MCAST_MSFILTER: -- cgit v1.2.3 From b212c322c8d73c48062340dc4cbe150c4ce97fb8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 27 Apr 2020 11:37:02 -0400 Subject: handle the group_source_req options directly Native ->setsockopt() handling of these options (MCAST_..._SOURCE_GROUP and MCAST_{,UN}BLOCK_SOURCE) consists of copyin + call of a helper that does the actual work. The only change needed for ->compat_setsockopt() is a slightly different copyin - the helpers can be reused as-is. 
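To illustrate, a minimal userspace sketch of one request in this family. The helper name join_source_group() and the addresses are arbitrary examples, not taken from the patch; issued from a 32-bit binary, the request goes through the field-by-field copyin added here and is then handled by the existing do_mcast_group_source() helper (do_ipv6_mcast_group_source() in the IPv6 case):

#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

/* Minimal sketch, example values only: an IPv4 source-specific join,
 * i.e. receive traffic for the group only from one given sender.
 */
static int join_source_group(int fd, unsigned int ifindex,
                             const char *group, const char *source)
{
        struct group_source_req gsr;
        struct sockaddr_in *g = (struct sockaddr_in *)&gsr.gsr_group;
        struct sockaddr_in *s = (struct sockaddr_in *)&gsr.gsr_source;

        memset(&gsr, 0, sizeof(gsr));
        gsr.gsr_interface = ifindex;
        g->sin_family = AF_INET;
        s->sin_family = AF_INET;
        if (inet_pton(AF_INET, group, &g->sin_addr) != 1 ||
            inet_pton(AF_INET, source, &s->sin_addr) != 1)
                return -1;

        return setsockopt(fd, IPPROTO_IP, MCAST_JOIN_SOURCE_GROUP,
                          &gsr, sizeof(gsr));
}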
Signed-off-by: Al Viro --- net/ipv4/ip_sockglue.c | 23 +++++++++++++++++++++-- net/ipv6/ipv6_sockglue.c | 23 +++++++++++++++++++++-- 2 files changed, 42 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 7f065a68664e..a2469bc57cfe 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1322,8 +1322,27 @@ int compat_ip_setsockopt(struct sock *sk, int level, int optname, case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: - return compat_mc_setsockopt(sk, level, optname, optval, optlen, - ip_setsockopt); + { + struct compat_group_source_req __user *gsr32 = (void __user *)optval; + struct group_source_req greqs; + + if (optlen != sizeof(struct compat_group_source_req)) + return -EINVAL; + + if (get_user(greqs.gsr_interface, &gsr32->gsr_interface) || + copy_from_user(&greqs.gsr_group, &gsr32->gsr_group, + sizeof(greqs.gsr_group)) || + copy_from_user(&greqs.gsr_source, &gsr32->gsr_source, + sizeof(greqs.gsr_source))) + return -EFAULT; + + rtnl_lock(); + lock_sock(sk); + err = do_mcast_group_source(sk, optname, &greqs); + release_sock(sk); + rtnl_unlock(); + return err; + } case MCAST_MSFILTER: { const int size0 = offsetof(struct compat_group_filter, gf_slist); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index bb049feeb787..e10258c2210e 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -1015,8 +1015,27 @@ int compat_ipv6_setsockopt(struct sock *sk, int level, int optname, case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: - return compat_mc_setsockopt(sk, level, optname, optval, optlen, - ipv6_setsockopt); + { + struct compat_group_source_req __user *gsr32 = (void __user *)optval; + struct group_source_req greqs; + + if (optlen < sizeof(struct compat_group_source_req)) + return -EINVAL; + + if (get_user(greqs.gsr_interface, &gsr32->gsr_interface) || + copy_from_user(&greqs.gsr_group, &gsr32->gsr_group, + sizeof(greqs.gsr_group)) || + copy_from_user(&greqs.gsr_source, &gsr32->gsr_source, + sizeof(greqs.gsr_source))) + return -EFAULT; + + rtnl_lock(); + lock_sock(sk); + err = do_ipv6_mcast_group_source(sk, optname, &greqs); + release_sock(sk); + rtnl_unlock(); + return err; + } case MCAST_MSFILTER: { const int size0 = offsetof(struct compat_group_filter, gf_slist); -- cgit v1.2.3 From bbced07d9952ca290e8de3957c75b8b401d7a867 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 27 Apr 2020 11:37:02 -0400 Subject: get rid of compat_mc_setsockopt() not used anymore Signed-off-by: Al Viro --- include/net/compat.h | 4 --- net/compat.c | 90 ---------------------------------------------------- 2 files changed, 94 deletions(-) (limited to 'net') diff --git a/include/net/compat.h b/include/net/compat.h index d714076d63d5..f241666117d8 100644 --- a/include/net/compat.h +++ b/include/net/compat.h @@ -67,10 +67,6 @@ int put_cmsg_compat(struct msghdr*, int, int, int, void *); int cmsghdr_from_user_compat_to_kern(struct msghdr *, struct sock *, unsigned char *, int); -int compat_mc_setsockopt(struct sock *, int, int, char __user *, unsigned int, - int (*)(struct sock *, int, int, char __user *, - unsigned int)); - struct compat_group_req { __u32 gr_interface; struct __kernel_sockaddr_storage gr_group diff --git a/net/compat.c b/net/compat.c index 7bdfda2b382a..afd7b444e0bf 100644 --- a/net/compat.c +++ b/net/compat.c @@ -448,96 +448,6 @@ COMPAT_SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, return 
__compat_sys_getsockopt(fd, level, optname, optval, optlen); } -#define __COMPAT_GF0_SIZE (sizeof(struct compat_group_filter) - \ - sizeof(struct __kernel_sockaddr_storage)) - -int compat_mc_setsockopt(struct sock *sock, int level, int optname, - char __user *optval, unsigned int optlen, - int (*setsockopt)(struct sock *, int, int, char __user *, unsigned int)) -{ - char __user *koptval = optval; - int koptlen = optlen; - - switch (optname) { - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - { - struct compat_group_req __user *gr32 = (void __user *)optval; - struct group_req __user *kgr = - compat_alloc_user_space(sizeof(struct group_req)); - u32 interface; - - if (!access_ok(gr32, sizeof(*gr32)) || - !access_ok(kgr, sizeof(struct group_req)) || - __get_user(interface, &gr32->gr_interface) || - __put_user(interface, &kgr->gr_interface) || - copy_in_user(&kgr->gr_group, &gr32->gr_group, - sizeof(kgr->gr_group))) - return -EFAULT; - koptval = (char __user *)kgr; - koptlen = sizeof(struct group_req); - break; - } - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - { - struct compat_group_source_req __user *gsr32 = (void __user *)optval; - struct group_source_req __user *kgsr = compat_alloc_user_space( - sizeof(struct group_source_req)); - u32 interface; - - if (!access_ok(gsr32, sizeof(*gsr32)) || - !access_ok(kgsr, - sizeof(struct group_source_req)) || - __get_user(interface, &gsr32->gsr_interface) || - __put_user(interface, &kgsr->gsr_interface) || - copy_in_user(&kgsr->gsr_group, &gsr32->gsr_group, - sizeof(kgsr->gsr_group)) || - copy_in_user(&kgsr->gsr_source, &gsr32->gsr_source, - sizeof(kgsr->gsr_source))) - return -EFAULT; - koptval = (char __user *)kgsr; - koptlen = sizeof(struct group_source_req); - break; - } - case MCAST_MSFILTER: - { - struct compat_group_filter __user *gf32 = (void __user *)optval; - struct group_filter __user *kgf; - u32 interface, fmode, numsrc; - - if (!access_ok(gf32, __COMPAT_GF0_SIZE) || - __get_user(interface, &gf32->gf_interface) || - __get_user(fmode, &gf32->gf_fmode) || - __get_user(numsrc, &gf32->gf_numsrc)) - return -EFAULT; - koptlen = optlen + sizeof(struct group_filter) - - sizeof(struct compat_group_filter); - if (koptlen < GROUP_FILTER_SIZE(numsrc)) - return -EINVAL; - kgf = compat_alloc_user_space(koptlen); - if (!access_ok(kgf, koptlen) || - __put_user(interface, &kgf->gf_interface) || - __put_user(fmode, &kgf->gf_fmode) || - __put_user(numsrc, &kgf->gf_numsrc) || - copy_in_user(&kgf->gf_group, &gf32->gf_group, - sizeof(kgf->gf_group)) || - (numsrc && copy_in_user(kgf->gf_slist, gf32->gf_slist, - numsrc * sizeof(kgf->gf_slist[0])))) - return -EFAULT; - koptval = (char __user *)kgf; - break; - } - - default: - break; - } - return setsockopt(sock, level, optname, koptval, koptlen); -} -EXPORT_SYMBOL(compat_mc_setsockopt); - /* Argument list sizes for compat_sys_socketcall */ #define AL(x) ((x) * sizeof(u32)) static unsigned char nas[21] = { -- cgit v1.2.3 From 38c53ca3c114fa2a7030f0d1f54feaf044957609 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 23 Apr 2020 10:19:04 -0400 Subject: batadv_socket_read(): get rid of pointless access_ok() address is passed only to copy_to_user() Signed-off-by: Al Viro --- net/batman-adv/icmp_socket.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index ccb535c77e5d..8bdabc03b0b2 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ 
-135,9 +135,6 @@ static ssize_t batadv_socket_read(struct file *file, char __user *buf, if (!buf || count < sizeof(struct batadv_icmp_packet)) return -EINVAL; - if (!access_ok(buf, count)) - return -EFAULT; - error = wait_event_interruptible(socket_client->queue_wait, socket_client->queue_len); -- cgit v1.2.3 From 8c2348e36af0da79477b0726781da297263269a4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 10 May 2020 17:20:49 -0400 Subject: atm: separate ATM_GETNAMES handling from the rest of atm_dev_ioctl() atm_dev_ioctl() does copyin in two different ways - one for ATM_GETNAMES, another for everything else. Start with separating the former into a new helper (atm_getnames()). The next step will be to lift the copyin into the callers. Signed-off-by: Al Viro --- net/atm/ioctl.c | 6 +++- net/atm/resources.c | 88 +++++++++++++++++++++++++++-------------------------- net/atm/resources.h | 1 + 3 files changed, 51 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c index d955b683aa7c..0b4b07740fe4 100644 --- a/net/atm/ioctl.c +++ b/net/atm/ioctl.c @@ -162,7 +162,11 @@ static int do_vcc_ioctl(struct socket *sock, unsigned int cmd, if (error != -ENOIOCTLCMD) goto done; - error = atm_dev_ioctl(cmd, argp, compat); + if (cmd == ATM_GETNAMES) { + error = atm_getnames(argp, compat); + } else { + error = atm_dev_ioctl(cmd, argp, compat); + } done: return error; diff --git a/net/atm/resources.c b/net/atm/resources.c index 889349c6d90d..a2ab75929eec 100644 --- a/net/atm/resources.c +++ b/net/atm/resources.c @@ -193,61 +193,63 @@ static int fetch_stats(struct atm_dev *dev, struct atm_dev_stats __user *arg, return error ? -EFAULT : 0; } -int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat) +int atm_getnames(void __user *arg, int compat) { void __user *buf; - int error, len, number, size = 0; + int error, len, size = 0; struct atm_dev *dev; struct list_head *p; int *tmp_buf, *tmp_p; - int __user *sioc_len; int __user *iobuf_len; - switch (cmd) { - case ATM_GETNAMES: - if (IS_ENABLED(CONFIG_COMPAT) && compat) { + if (IS_ENABLED(CONFIG_COMPAT) && compat) { #ifdef CONFIG_COMPAT - struct compat_atm_iobuf __user *ciobuf = arg; - compat_uptr_t cbuf; - iobuf_len = &ciobuf->length; - if (get_user(cbuf, &ciobuf->buffer)) - return -EFAULT; - buf = compat_ptr(cbuf); + struct compat_atm_iobuf __user *ciobuf = arg; + compat_uptr_t cbuf; + iobuf_len = &ciobuf->length; + if (get_user(cbuf, &ciobuf->buffer)) + return -EFAULT; + buf = compat_ptr(cbuf); #endif - } else { - struct atm_iobuf __user *iobuf = arg; - iobuf_len = &iobuf->length; - if (get_user(buf, &iobuf->buffer)) - return -EFAULT; - } - if (get_user(len, iobuf_len)) + } else { + struct atm_iobuf __user *iobuf = arg; + iobuf_len = &iobuf->length; + if (get_user(buf, &iobuf->buffer)) return -EFAULT; - mutex_lock(&atm_dev_mutex); - list_for_each(p, &atm_devs) - size += sizeof(int); - if (size > len) { - mutex_unlock(&atm_dev_mutex); - return -E2BIG; - } - tmp_buf = kmalloc(size, GFP_ATOMIC); - if (!tmp_buf) { - mutex_unlock(&atm_dev_mutex); - return -ENOMEM; - } - tmp_p = tmp_buf; - list_for_each(p, &atm_devs) { - dev = list_entry(p, struct atm_dev, dev_list); - *tmp_p++ = dev->number; - } + } + if (get_user(len, iobuf_len)) + return -EFAULT; + mutex_lock(&atm_dev_mutex); + list_for_each(p, &atm_devs) + size += sizeof(int); + if (size > len) { mutex_unlock(&atm_dev_mutex); - error = ((copy_to_user(buf, tmp_buf, size)) || - put_user(size, iobuf_len)) - ? 
-EFAULT : 0; - kfree(tmp_buf); - return error; - default: - break; + return -E2BIG; } + tmp_buf = kmalloc(size, GFP_ATOMIC); + if (!tmp_buf) { + mutex_unlock(&atm_dev_mutex); + return -ENOMEM; + } + tmp_p = tmp_buf; + list_for_each(p, &atm_devs) { + dev = list_entry(p, struct atm_dev, dev_list); + *tmp_p++ = dev->number; + } + mutex_unlock(&atm_dev_mutex); + error = ((copy_to_user(buf, tmp_buf, size)) || + put_user(size, iobuf_len)) + ? -EFAULT : 0; + kfree(tmp_buf); + return error; +} + +int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat) +{ + void __user *buf; + int error, len, number, size = 0; + struct atm_dev *dev; + int __user *sioc_len; if (IS_ENABLED(CONFIG_COMPAT) && compat) { #ifdef CONFIG_COMPAT diff --git a/net/atm/resources.h b/net/atm/resources.h index 048232e4d4c6..18f8e5948ce4 100644 --- a/net/atm/resources.h +++ b/net/atm/resources.h @@ -14,6 +14,7 @@ extern struct list_head atm_devs; extern struct mutex atm_dev_mutex; +int atm_getnames(void __user *arg, int compat); int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat); -- cgit v1.2.3 From a3929484af75ee524419edbbc4e9ce012c3d67c9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 10 May 2020 17:34:20 -0400 Subject: atm: move copyin from atm_getnames() into the caller Signed-off-by: Al Viro --- net/atm/ioctl.c | 19 ++++++++++++++++++- net/atm/resources.c | 19 +------------------ net/atm/resources.h | 2 +- 3 files changed, 20 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c index 0b4b07740fe4..e239cebf48da 100644 --- a/net/atm/ioctl.c +++ b/net/atm/ioctl.c @@ -56,6 +56,8 @@ static int do_vcc_ioctl(struct socket *sock, unsigned int cmd, int error; struct list_head *pos; void __user *argp = (void __user *)arg; + void __user *buf; + int __user *len; vcc = ATM_SD(sock); switch (cmd) { @@ -163,7 +165,22 @@ static int do_vcc_ioctl(struct socket *sock, unsigned int cmd, goto done; if (cmd == ATM_GETNAMES) { - error = atm_getnames(argp, compat); + if (IS_ENABLED(CONFIG_COMPAT) && compat) { +#ifdef CONFIG_COMPAT + struct compat_atm_iobuf __user *ciobuf = argp; + compat_uptr_t cbuf; + len = &ciobuf->length; + if (get_user(cbuf, &ciobuf->buffer)) + return -EFAULT; + buf = compat_ptr(cbuf); +#endif + } else { + struct atm_iobuf __user *iobuf = argp; + len = &iobuf->length; + if (get_user(buf, &iobuf->buffer)) + return -EFAULT; + } + error = atm_getnames(buf, len); } else { error = atm_dev_ioctl(cmd, argp, compat); } diff --git a/net/atm/resources.c b/net/atm/resources.c index a2ab75929eec..5507cc608969 100644 --- a/net/atm/resources.c +++ b/net/atm/resources.c @@ -193,30 +193,13 @@ static int fetch_stats(struct atm_dev *dev, struct atm_dev_stats __user *arg, return error ? 
-EFAULT : 0; } -int atm_getnames(void __user *arg, int compat) +int atm_getnames(void __user *buf, int __user *iobuf_len) { - void __user *buf; int error, len, size = 0; struct atm_dev *dev; struct list_head *p; int *tmp_buf, *tmp_p; - int __user *iobuf_len; - if (IS_ENABLED(CONFIG_COMPAT) && compat) { -#ifdef CONFIG_COMPAT - struct compat_atm_iobuf __user *ciobuf = arg; - compat_uptr_t cbuf; - iobuf_len = &ciobuf->length; - if (get_user(cbuf, &ciobuf->buffer)) - return -EFAULT; - buf = compat_ptr(cbuf); -#endif - } else { - struct atm_iobuf __user *iobuf = arg; - iobuf_len = &iobuf->length; - if (get_user(buf, &iobuf->buffer)) - return -EFAULT; - } if (get_user(len, iobuf_len)) return -EFAULT; mutex_lock(&atm_dev_mutex); diff --git a/net/atm/resources.h b/net/atm/resources.h index 18f8e5948ce4..5e2c68d37d63 100644 --- a/net/atm/resources.h +++ b/net/atm/resources.h @@ -14,7 +14,7 @@ extern struct list_head atm_devs; extern struct mutex atm_dev_mutex; -int atm_getnames(void __user *arg, int compat); +int atm_getnames(void __user *buf, int __user *iobuf_len); int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat); -- cgit v1.2.3 From 36085049bc0acb6f2e784f430c2cc66944a2ef07 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 10 May 2020 17:41:51 -0400 Subject: atm: switch do_atm_iobuf() to direct use of atm_getnames() ... and sod the compat_alloc_user_space() with its complications Signed-off-by: Al Viro --- net/atm/ioctl.c | 25 +++---------------------- 1 file changed, 3 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c index e239cebf48da..fdd0e3434523 100644 --- a/net/atm/ioctl.c +++ b/net/atm/ioctl.c @@ -251,32 +251,13 @@ static struct { static int do_atm_iobuf(struct socket *sock, unsigned int cmd, unsigned long arg) { - struct atm_iobuf __user *iobuf; - struct compat_atm_iobuf __user *iobuf32; + struct compat_atm_iobuf __user *iobuf32 = compat_ptr(arg); u32 data; - void __user *datap; - int len, err; - - iobuf = compat_alloc_user_space(sizeof(*iobuf)); - iobuf32 = compat_ptr(arg); - if (get_user(len, &iobuf32->length) || - get_user(data, &iobuf32->buffer)) - return -EFAULT; - datap = compat_ptr(data); - if (put_user(len, &iobuf->length) || - put_user(datap, &iobuf->buffer)) + if (get_user(data, &iobuf32->buffer)) return -EFAULT; - err = do_vcc_ioctl(sock, cmd, (unsigned long) iobuf, 0); - - if (!err) { - if (copy_in_user(&iobuf32->length, &iobuf->length, - sizeof(int))) - err = -EFAULT; - } - - return err; + return atm_getnames(&iobuf32->length, compat_ptr(data)); } static int do_atmif_sioc(struct socket *sock, unsigned int cmd, -- cgit v1.2.3 From 8cacb4165985444c275a6f813f91f08479bdbfad Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 10 May 2020 17:53:35 -0400 Subject: atm: lift copyin from atm_dev_ioctl() Signed-off-by: Al Viro --- net/atm/ioctl.c | 25 ++++++++++++++++++++++++- net/atm/resources.c | 35 +++++------------------------------ net/atm/resources.h | 4 ++-- 3 files changed, 31 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c index fdd0e3434523..52f2c77e656f 100644 --- a/net/atm/ioctl.c +++ b/net/atm/ioctl.c @@ -182,7 +182,30 @@ static int do_vcc_ioctl(struct socket *sock, unsigned int cmd, } error = atm_getnames(buf, len); } else { - error = atm_dev_ioctl(cmd, argp, compat); + int number; + + if (IS_ENABLED(CONFIG_COMPAT) && compat) { +#ifdef CONFIG_COMPAT + struct compat_atmif_sioc __user *csioc = argp; + compat_uptr_t carg; + + len = &csioc->length; + if 
(get_user(carg, &csioc->arg)) + return -EFAULT; + buf = compat_ptr(carg); + if (get_user(number, &csioc->number)) + return -EFAULT; +#endif + } else { + struct atmif_sioc __user *sioc = argp; + + len = &sioc->length; + if (get_user(buf, &sioc->arg)) + return -EFAULT; + if (get_user(number, &sioc->number)) + return -EFAULT; + } + error = atm_dev_ioctl(cmd, buf, len, number, compat); } done: diff --git a/net/atm/resources.c b/net/atm/resources.c index 5507cc608969..94bdc6527ee8 100644 --- a/net/atm/resources.c +++ b/net/atm/resources.c @@ -227,39 +227,14 @@ int atm_getnames(void __user *buf, int __user *iobuf_len) return error; } -int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat) +int atm_dev_ioctl(unsigned int cmd, void __user *buf, int __user *sioc_len, + int number, int compat) { - void __user *buf; - int error, len, number, size = 0; + int error, len, size = 0; struct atm_dev *dev; - int __user *sioc_len; - if (IS_ENABLED(CONFIG_COMPAT) && compat) { -#ifdef CONFIG_COMPAT - struct compat_atmif_sioc __user *csioc = arg; - compat_uptr_t carg; - - sioc_len = &csioc->length; - if (get_user(carg, &csioc->arg)) - return -EFAULT; - buf = compat_ptr(carg); - - if (get_user(len, &csioc->length)) - return -EFAULT; - if (get_user(number, &csioc->number)) - return -EFAULT; -#endif - } else { - struct atmif_sioc __user *sioc = arg; - - sioc_len = &sioc->length; - if (get_user(buf, &sioc->arg)) - return -EFAULT; - if (get_user(len, &sioc->length)) - return -EFAULT; - if (get_user(number, &sioc->number)) - return -EFAULT; - } + if (get_user(len, sioc_len)) + return -EFAULT; dev = try_then_request_module(atm_dev_lookup(number), "atm-device-%d", number); diff --git a/net/atm/resources.h b/net/atm/resources.h index 5e2c68d37d63..4a0839e92ff3 100644 --- a/net/atm/resources.h +++ b/net/atm/resources.h @@ -15,8 +15,8 @@ extern struct list_head atm_devs; extern struct mutex atm_dev_mutex; int atm_getnames(void __user *buf, int __user *iobuf_len); -int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat); - +int atm_dev_ioctl(unsigned int cmd, void __user *buf, int __user *sioc_len, + int number, int compat); #ifdef CONFIG_PROC_FS -- cgit v1.2.3 From 0edecc020b33f8e31d8baa80735b45e8e8434700 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 10 May 2020 18:13:56 -0400 Subject: atm: switch do_atmif_sioc() to direct use of atm_dev_ioctl() Signed-off-by: Al Viro --- net/atm/ioctl.c | 25 ++++--------------------- 1 file changed, 4 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c index 52f2c77e656f..838ebf0cabbf 100644 --- a/net/atm/ioctl.c +++ b/net/atm/ioctl.c @@ -286,30 +286,13 @@ static int do_atm_iobuf(struct socket *sock, unsigned int cmd, static int do_atmif_sioc(struct socket *sock, unsigned int cmd, unsigned long arg) { - struct atmif_sioc __user *sioc; - struct compat_atmif_sioc __user *sioc32; + struct compat_atmif_sioc __user *sioc32 = compat_ptr(arg); + int number; u32 data; - void __user *datap; - int err; - sioc = compat_alloc_user_space(sizeof(*sioc)); - sioc32 = compat_ptr(arg); - - if (copy_in_user(&sioc->number, &sioc32->number, 2 * sizeof(int)) || - get_user(data, &sioc32->arg)) - return -EFAULT; - datap = compat_ptr(data); - if (put_user(datap, &sioc->arg)) + if (get_user(data, &sioc32->arg) || get_user(number, &sioc32->number)) return -EFAULT; - - err = do_vcc_ioctl(sock, cmd, (unsigned long) sioc, 0); - - if (!err) { - if (copy_in_user(&sioc32->length, &sioc->length, - sizeof(int))) - err = -EFAULT; - } - return err; + 
return atm_dev_ioctl(cmd, compat_ptr(data), &sioc32->length, number, 0); } static int do_atm_ioctl(struct socket *sock, unsigned int cmd32, -- cgit v1.2.3 From d8bed686ab96169ac80b497d1cbed89300d97f83 Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Tue, 19 May 2020 22:45:20 +0800 Subject: net: psample: Add tunnel support Currently, psample can only send the packet bits after decapsulation. The tunnel information is lost. Add the tunnel support. If the sampled packet has no tunnel info, the behavior is the same as before. If it has, add a nested metadata field named PSAMPLE_ATTR_TUNNEL and include the tunnel subfields if applicable. Increase the metadata length for sampled packet with the tunnel info. If new subfields of tunnel info should be included, update the metadata length accordingly. Signed-off-by: Chris Mi Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/psample.h | 22 ++++++ net/psample/psample.c | 157 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 179 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/psample.h b/include/uapi/linux/psample.h index ce1116cff53d..aea26ab1431c 100644 --- a/include/uapi/linux/psample.h +++ b/include/uapi/linux/psample.h @@ -11,6 +11,7 @@ enum { PSAMPLE_ATTR_GROUP_SEQ, PSAMPLE_ATTR_SAMPLE_RATE, PSAMPLE_ATTR_DATA, + PSAMPLE_ATTR_TUNNEL, /* commands attributes */ PSAMPLE_ATTR_GROUP_REFCOUNT, @@ -25,6 +26,27 @@ enum psample_command { PSAMPLE_CMD_DEL_GROUP, }; +enum psample_tunnel_key_attr { + PSAMPLE_TUNNEL_KEY_ATTR_ID, /* be64 Tunnel ID */ + PSAMPLE_TUNNEL_KEY_ATTR_IPV4_SRC, /* be32 src IP address. */ + PSAMPLE_TUNNEL_KEY_ATTR_IPV4_DST, /* be32 dst IP address. */ + PSAMPLE_TUNNEL_KEY_ATTR_TOS, /* u8 Tunnel IP ToS. */ + PSAMPLE_TUNNEL_KEY_ATTR_TTL, /* u8 Tunnel IP TTL. */ + PSAMPLE_TUNNEL_KEY_ATTR_DONT_FRAGMENT, /* No argument, set DF. */ + PSAMPLE_TUNNEL_KEY_ATTR_CSUM, /* No argument. CSUM packet. */ + PSAMPLE_TUNNEL_KEY_ATTR_OAM, /* No argument. OAM frame. */ + PSAMPLE_TUNNEL_KEY_ATTR_GENEVE_OPTS, /* Array of Geneve options. */ + PSAMPLE_TUNNEL_KEY_ATTR_TP_SRC, /* be16 src Transport Port. */ + PSAMPLE_TUNNEL_KEY_ATTR_TP_DST, /* be16 dst Transport Port. */ + PSAMPLE_TUNNEL_KEY_ATTR_VXLAN_OPTS, /* Nested VXLAN opts* */ + PSAMPLE_TUNNEL_KEY_ATTR_IPV6_SRC, /* struct in6_addr src IPv6 address. */ + PSAMPLE_TUNNEL_KEY_ATTR_IPV6_DST, /* struct in6_addr dst IPv6 address. */ + PSAMPLE_TUNNEL_KEY_ATTR_PAD, + PSAMPLE_TUNNEL_KEY_ATTR_ERSPAN_OPTS, /* struct erspan_metadata */ + PSAMPLE_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE, /* No argument. 
IPV4_INFO_BRIDGE mode.*/ + __PSAMPLE_TUNNEL_KEY_ATTR_MAX +}; + /* Can be overridden at runtime by module option */ #define PSAMPLE_ATTR_MAX (__PSAMPLE_ATTR_MAX - 1) diff --git a/net/psample/psample.c b/net/psample/psample.c index 6f2fbc6b9eb2..34a74043840b 100644 --- a/net/psample/psample.c +++ b/net/psample/psample.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #define PSAMPLE_MAX_PACKET_SIZE 0xffff @@ -207,10 +209,155 @@ void psample_group_put(struct psample_group *group) } EXPORT_SYMBOL_GPL(psample_group_put); +static int __psample_ip_tun_to_nlattr(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) +{ + unsigned short tun_proto = ip_tunnel_info_af(tun_info); + const void *tun_opts = ip_tunnel_info_opts(tun_info); + const struct ip_tunnel_key *tun_key = &tun_info->key; + int tun_opts_len = tun_info->options_len; + + if (tun_key->tun_flags & TUNNEL_KEY && + nla_put_be64(skb, PSAMPLE_TUNNEL_KEY_ATTR_ID, tun_key->tun_id, + PSAMPLE_TUNNEL_KEY_ATTR_PAD)) + return -EMSGSIZE; + + if (tun_info->mode & IP_TUNNEL_INFO_BRIDGE && + nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE)) + return -EMSGSIZE; + + switch (tun_proto) { + case AF_INET: + if (tun_key->u.ipv4.src && + nla_put_in_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV4_SRC, + tun_key->u.ipv4.src)) + return -EMSGSIZE; + if (tun_key->u.ipv4.dst && + nla_put_in_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV4_DST, + tun_key->u.ipv4.dst)) + return -EMSGSIZE; + break; + case AF_INET6: + if (!ipv6_addr_any(&tun_key->u.ipv6.src) && + nla_put_in6_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV6_SRC, + &tun_key->u.ipv6.src)) + return -EMSGSIZE; + if (!ipv6_addr_any(&tun_key->u.ipv6.dst) && + nla_put_in6_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV6_DST, + &tun_key->u.ipv6.dst)) + return -EMSGSIZE; + break; + } + if (tun_key->tos && + nla_put_u8(skb, PSAMPLE_TUNNEL_KEY_ATTR_TOS, tun_key->tos)) + return -EMSGSIZE; + if (nla_put_u8(skb, PSAMPLE_TUNNEL_KEY_ATTR_TTL, tun_key->ttl)) + return -EMSGSIZE; + if ((tun_key->tun_flags & TUNNEL_DONT_FRAGMENT) && + nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) + return -EMSGSIZE; + if ((tun_key->tun_flags & TUNNEL_CSUM) && + nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_CSUM)) + return -EMSGSIZE; + if (tun_key->tp_src && + nla_put_be16(skb, PSAMPLE_TUNNEL_KEY_ATTR_TP_SRC, tun_key->tp_src)) + return -EMSGSIZE; + if (tun_key->tp_dst && + nla_put_be16(skb, PSAMPLE_TUNNEL_KEY_ATTR_TP_DST, tun_key->tp_dst)) + return -EMSGSIZE; + if ((tun_key->tun_flags & TUNNEL_OAM) && + nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_OAM)) + return -EMSGSIZE; + if (tun_opts_len) { + if (tun_key->tun_flags & TUNNEL_GENEVE_OPT && + nla_put(skb, PSAMPLE_TUNNEL_KEY_ATTR_GENEVE_OPTS, + tun_opts_len, tun_opts)) + return -EMSGSIZE; + else if (tun_key->tun_flags & TUNNEL_ERSPAN_OPT && + nla_put(skb, PSAMPLE_TUNNEL_KEY_ATTR_ERSPAN_OPTS, + tun_opts_len, tun_opts)) + return -EMSGSIZE; + } + + return 0; +} + +static int psample_ip_tun_to_nlattr(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) +{ + struct nlattr *nla; + int err; + + nla = nla_nest_start_noflag(skb, PSAMPLE_ATTR_TUNNEL); + if (!nla) + return -EMSGSIZE; + + err = __psample_ip_tun_to_nlattr(skb, tun_info); + if (err) { + nla_nest_cancel(skb, nla); + return err; + } + + nla_nest_end(skb, nla); + + return 0; +} + +static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info) +{ + unsigned short tun_proto = ip_tunnel_info_af(tun_info); + const struct ip_tunnel_key *tun_key = &tun_info->key; + int tun_opts_len = tun_info->options_len; + int sum = 0; + + if 
(tun_key->tun_flags & TUNNEL_KEY) + sum += nla_total_size(sizeof(u64)); + + if (tun_info->mode & IP_TUNNEL_INFO_BRIDGE) + sum += nla_total_size(0); + + switch (tun_proto) { + case AF_INET: + if (tun_key->u.ipv4.src) + sum += nla_total_size(sizeof(u32)); + if (tun_key->u.ipv4.dst) + sum += nla_total_size(sizeof(u32)); + break; + case AF_INET6: + if (!ipv6_addr_any(&tun_key->u.ipv6.src)) + sum += nla_total_size(sizeof(struct in6_addr)); + if (!ipv6_addr_any(&tun_key->u.ipv6.dst)) + sum += nla_total_size(sizeof(struct in6_addr)); + break; + } + if (tun_key->tos) + sum += nla_total_size(sizeof(u8)); + sum += nla_total_size(sizeof(u8)); /* TTL */ + if (tun_key->tun_flags & TUNNEL_DONT_FRAGMENT) + sum += nla_total_size(0); + if (tun_key->tun_flags & TUNNEL_CSUM) + sum += nla_total_size(0); + if (tun_key->tp_src) + sum += nla_total_size(sizeof(u16)); + if (tun_key->tp_dst) + sum += nla_total_size(sizeof(u16)); + if (tun_key->tun_flags & TUNNEL_OAM) + sum += nla_total_size(0); + if (tun_opts_len) { + if (tun_key->tun_flags & TUNNEL_GENEVE_OPT) + sum += nla_total_size(tun_opts_len); + else if (tun_key->tun_flags & TUNNEL_ERSPAN_OPT) + sum += nla_total_size(tun_opts_len); + } + + return sum; +} + void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, u32 trunc_size, int in_ifindex, int out_ifindex, u32 sample_rate) { + struct ip_tunnel_info *tun_info; struct sk_buff *nl_skb; int data_len; int meta_len; @@ -224,6 +371,10 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, nla_total_size(sizeof(u32)) + /* group_num */ nla_total_size(sizeof(u32)); /* seq */ + tun_info = skb_tunnel_info(skb); + if (tun_info) + meta_len += psample_tunnel_meta_len(tun_info); + data_len = min(skb->len, trunc_size); if (meta_len + nla_total_size(data_len) > PSAMPLE_MAX_PACKET_SIZE) data_len = PSAMPLE_MAX_PACKET_SIZE - meta_len - NLA_HDRLEN @@ -278,6 +429,12 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, goto error; } + if (tun_info) { + ret = psample_ip_tun_to_nlattr(nl_skb, tun_info); + if (unlikely(ret < 0)) + goto error; + } + genlmsg_end(nl_skb, data); genlmsg_multicast_netns(&psample_nl_family, group->net, nl_skb, 0, PSAMPLE_NL_MCGRP_SAMPLE, GFP_ATOMIC); -- cgit v1.2.3 From 8066021915924f58ed338bf38208215f5a7355f6 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 20 May 2020 08:29:14 +0200 Subject: ethtool: provide UAPI for PHY Signal Quality Index (SQI) Signal Quality Index is a mandatory value required by "OPEN Alliance SIG" for the 100Base-T1 PHYs [1]. This indicator can be used for cable integrity diagnostics and for investigating other noise sources, and is implemented by at least two vendors: NXP[2] and TI[3]. [1] http://www.opensig.org/download/document/218/Advanced_PHY_features_for_automotive_Ethernet_V1.0.pdf [2] https://www.nxp.com/docs/en/data-sheet/TJA1100.pdf [3] https://www.ti.com/product/DP83TC811R-Q1 Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Reviewed-by: Michal Kubecek Signed-off-by: David S.
Miller --- Documentation/networking/ethtool-netlink.rst | 6 ++- include/linux/phy.h | 2 + include/uapi/linux/ethtool_netlink.h | 2 + net/ethtool/linkstate.c | 75 +++++++++++++++++++++++++++- 4 files changed, 82 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index eed46b6aa07d..7e651ea33eab 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -454,10 +454,12 @@ Request contents: Kernel response contents: - ==================================== ====== ========================== + ==================================== ====== ============================ ``ETHTOOL_A_LINKSTATE_HEADER`` nested reply header ``ETHTOOL_A_LINKSTATE_LINK`` bool link state (up/down) - ==================================== ====== ========================== + ``ETHTOOL_A_LINKSTATE_SQI`` u32 Current Signal Quality Index + ``ETHTOOL_A_LINKSTATE_SQI_MAX`` u32 Max support SQI value + ==================================== ====== ============================ For most NIC drivers, the value of ``ETHTOOL_A_LINKSTATE_LINK`` returns carrier flag provided by ``netif_carrier_ok()`` but there are drivers which diff --git a/include/linux/phy.h b/include/linux/phy.h index 467aa8bf9f64..2bcdf19ed3b4 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -723,6 +723,8 @@ struct phy_driver { struct ethtool_tunable *tuna, const void *data); int (*set_loopback)(struct phy_device *dev, bool enable); + int (*get_sqi)(struct phy_device *dev); + int (*get_sqi_max)(struct phy_device *dev); }; #define to_phy_driver(d) container_of(to_mdio_common_driver(d), \ struct phy_driver, mdiodrv) diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 2881af411f76..e6f109b76c9a 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -232,6 +232,8 @@ enum { ETHTOOL_A_LINKSTATE_UNSPEC, ETHTOOL_A_LINKSTATE_HEADER, /* nest - _A_HEADER_* */ ETHTOOL_A_LINKSTATE_LINK, /* u8 */ + ETHTOOL_A_LINKSTATE_SQI, /* u32 */ + ETHTOOL_A_LINKSTATE_SQI_MAX, /* u32 */ /* add new constants above here */ __ETHTOOL_A_LINKSTATE_CNT, diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c index 2740cde0a182..7f47ba89054e 100644 --- a/net/ethtool/linkstate.c +++ b/net/ethtool/linkstate.c @@ -2,6 +2,7 @@ #include "netlink.h" #include "common.h" +#include struct linkstate_req_info { struct ethnl_req_info base; @@ -10,6 +11,8 @@ struct linkstate_req_info { struct linkstate_reply_data { struct ethnl_reply_data base; int link; + int sqi; + int sqi_max; }; #define LINKSTATE_REPDATA(__reply_base) \ @@ -20,8 +23,46 @@ linkstate_get_policy[ETHTOOL_A_LINKSTATE_MAX + 1] = { [ETHTOOL_A_LINKSTATE_UNSPEC] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKSTATE_HEADER] = { .type = NLA_NESTED }, [ETHTOOL_A_LINKSTATE_LINK] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKSTATE_SQI] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKSTATE_SQI_MAX] = { .type = NLA_REJECT }, }; +static int linkstate_get_sqi(struct net_device *dev) +{ + struct phy_device *phydev = dev->phydev; + int ret; + + if (!phydev) + return -EOPNOTSUPP; + + mutex_lock(&phydev->lock); + if (!phydev->drv || !phydev->drv->get_sqi) + ret = -EOPNOTSUPP; + else + ret = phydev->drv->get_sqi(phydev); + mutex_unlock(&phydev->lock); + + return ret; +} + +static int linkstate_get_sqi_max(struct net_device *dev) +{ + struct phy_device *phydev = dev->phydev; + int ret; + + if (!phydev) + return -EOPNOTSUPP; + + 
mutex_lock(&phydev->lock); + if (!phydev->drv || !phydev->drv->get_sqi_max) + ret = -EOPNOTSUPP; + else + ret = phydev->drv->get_sqi_max(phydev); + mutex_unlock(&phydev->lock); + + return ret; +} + static int linkstate_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, struct genl_info *info) @@ -34,6 +75,19 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, if (ret < 0) return ret; data->link = __ethtool_get_link(dev); + + ret = linkstate_get_sqi(dev); + if (ret < 0 && ret != -EOPNOTSUPP) + return ret; + + data->sqi = ret; + + ret = linkstate_get_sqi_max(dev); + if (ret < 0 && ret != -EOPNOTSUPP) + return ret; + + data->sqi_max = ret; + ethnl_ops_complete(dev); return 0; @@ -42,8 +96,19 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, static int linkstate_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { - return nla_total_size(sizeof(u8)) /* LINKSTATE_LINK */ + struct linkstate_reply_data *data = LINKSTATE_REPDATA(reply_base); + int len; + + len = nla_total_size(sizeof(u8)) /* LINKSTATE_LINK */ + 0; + + if (data->sqi != -EOPNOTSUPP) + len += nla_total_size(sizeof(u32)); + + if (data->sqi_max != -EOPNOTSUPP) + len += nla_total_size(sizeof(u32)); + + return len; } static int linkstate_fill_reply(struct sk_buff *skb, @@ -56,6 +121,14 @@ static int linkstate_fill_reply(struct sk_buff *skb, nla_put_u8(skb, ETHTOOL_A_LINKSTATE_LINK, !!data->link)) return -EMSGSIZE; + if (data->sqi != -EOPNOTSUPP && + nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI, data->sqi)) + return -EMSGSIZE; + + if (data->sqi_max != -EOPNOTSUPP && + nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI_MAX, data->sqi_max)) + return -EMSGSIZE; + return 0; } -- cgit v1.2.3 From d20a1676df7e4c3c23d73299159811a50e4854bc Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 20 May 2020 21:20:50 +0200 Subject: xsk: Move xskmap.c to net/xdp/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The XSKMAP is partly implemented by net/xdp/xsk.c. Move xskmap.c from kernel/bpf/ to net/xdp/, which is the logical place for AF_XDP related code. Also, move AF_XDP struct definitions, and function declarations only used by AF_XDP internals into net/xdp/xsk.h. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200520192103.355233-3-bjorn.topel@gmail.com --- include/net/xdp_sock.h | 20 ---- kernel/bpf/Makefile | 3 - kernel/bpf/xskmap.c | 265 ------------------------------------------------ net/xdp/Makefile | 2 +- net/xdp/xsk.h | 16 +++ net/xdp/xskmap.c | 267 +++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 284 insertions(+), 289 deletions(-) delete mode 100644 kernel/bpf/xskmap.c create mode 100644 net/xdp/xskmap.c (limited to 'net') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 6b1137ce1692..8f3f6f5b0dfe 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -65,22 +65,12 @@ struct xdp_umem { struct list_head xsk_tx_list; }; -/* Nodes are linked in the struct xdp_sock map_list field, and used to - * track which maps a certain socket reside in. 
- */ - struct xsk_map { struct bpf_map map; spinlock_t lock; /* Synchronize map updates */ struct xdp_sock *xsk_map[]; }; -struct xsk_map_node { - struct list_head node; - struct xsk_map *map; - struct xdp_sock **map_entry; -}; - struct xdp_sock { /* struct sock must be the first member of struct xdp_sock */ struct sock sk; @@ -114,7 +104,6 @@ struct xdp_sock { struct xdp_buff; #ifdef CONFIG_XDP_SOCKETS int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); -bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); /* Used from netdev driver */ bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt); bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); @@ -133,10 +122,6 @@ void xsk_clear_rx_need_wakeup(struct xdp_umem *umem); void xsk_clear_tx_need_wakeup(struct xdp_umem *umem); bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem); -void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, - struct xdp_sock **map_entry); -int xsk_map_inc(struct xsk_map *map); -void xsk_map_put(struct xsk_map *map); int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); void __xsk_map_flush(void); @@ -248,11 +233,6 @@ static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) return -ENOTSUPP; } -static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) -{ - return false; -} - static inline bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) { return false; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 37b2d8620153..375b933010dd 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -12,9 +12,6 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o -ifeq ($(CONFIG_XDP_SOCKETS),y) -obj-$(CONFIG_BPF_SYSCALL) += xskmap.o -endif obj-$(CONFIG_BPF_SYSCALL) += offload.o endif ifeq ($(CONFIG_PERF_EVENTS),y) diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c deleted file mode 100644 index 2cc5c8f4c800..000000000000 --- a/kernel/bpf/xskmap.c +++ /dev/null @@ -1,265 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* XSKMAP used for AF_XDP sockets - * Copyright(c) 2018 Intel Corporation. 
- */ - -#include -#include -#include -#include -#include - -int xsk_map_inc(struct xsk_map *map) -{ - bpf_map_inc(&map->map); - return 0; -} - -void xsk_map_put(struct xsk_map *map) -{ - bpf_map_put(&map->map); -} - -static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, - struct xdp_sock **map_entry) -{ - struct xsk_map_node *node; - int err; - - node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); - if (!node) - return ERR_PTR(-ENOMEM); - - err = xsk_map_inc(map); - if (err) { - kfree(node); - return ERR_PTR(err); - } - - node->map = map; - node->map_entry = map_entry; - return node; -} - -static void xsk_map_node_free(struct xsk_map_node *node) -{ - xsk_map_put(node->map); - kfree(node); -} - -static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node) -{ - spin_lock_bh(&xs->map_list_lock); - list_add_tail(&node->node, &xs->map_list); - spin_unlock_bh(&xs->map_list_lock); -} - -static void xsk_map_sock_delete(struct xdp_sock *xs, - struct xdp_sock **map_entry) -{ - struct xsk_map_node *n, *tmp; - - spin_lock_bh(&xs->map_list_lock); - list_for_each_entry_safe(n, tmp, &xs->map_list, node) { - if (map_entry == n->map_entry) { - list_del(&n->node); - xsk_map_node_free(n); - } - } - spin_unlock_bh(&xs->map_list_lock); -} - -static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) -{ - struct bpf_map_memory mem; - int err, numa_node; - struct xsk_map *m; - u64 size; - - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); - - if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || - attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)) - return ERR_PTR(-EINVAL); - - numa_node = bpf_map_attr_numa_node(attr); - size = struct_size(m, xsk_map, attr->max_entries); - - err = bpf_map_charge_init(&mem, size); - if (err < 0) - return ERR_PTR(err); - - m = bpf_map_area_alloc(size, numa_node); - if (!m) { - bpf_map_charge_finish(&mem); - return ERR_PTR(-ENOMEM); - } - - bpf_map_init_from_attr(&m->map, attr); - bpf_map_charge_move(&m->map.memory, &mem); - spin_lock_init(&m->lock); - - return &m->map; -} - -static void xsk_map_free(struct bpf_map *map) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - - bpf_clear_redirect_map(map); - synchronize_net(); - bpf_map_area_free(m); -} - -static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - u32 index = key ? 
*(u32 *)key : U32_MAX; - u32 *next = next_key; - - if (index >= m->map.max_entries) { - *next = 0; - return 0; - } - - if (index == m->map.max_entries - 1) - return -ENOENT; - *next = index + 1; - return 0; -} - -static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) -{ - const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2; - struct bpf_insn *insn = insn_buf; - - *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); - *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *))); - *insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map)); - *insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0); - *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); - *insn++ = BPF_MOV64_IMM(ret, 0); - return insn - insn_buf; -} - -static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - return __xsk_map_lookup_elem(map, *(u32 *)key); -} - -static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key) -{ - return ERR_PTR(-EOPNOTSUPP); -} - -static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *xs, *old_xs, **map_entry; - u32 i = *(u32 *)key, fd = *(u32 *)value; - struct xsk_map_node *node; - struct socket *sock; - int err; - - if (unlikely(map_flags > BPF_EXIST)) - return -EINVAL; - if (unlikely(i >= m->map.max_entries)) - return -E2BIG; - - sock = sockfd_lookup(fd, &err); - if (!sock) - return err; - - if (sock->sk->sk_family != PF_XDP) { - sockfd_put(sock); - return -EOPNOTSUPP; - } - - xs = (struct xdp_sock *)sock->sk; - - if (!xsk_is_setup_for_bpf_map(xs)) { - sockfd_put(sock); - return -EOPNOTSUPP; - } - - map_entry = &m->xsk_map[i]; - node = xsk_map_node_alloc(m, map_entry); - if (IS_ERR(node)) { - sockfd_put(sock); - return PTR_ERR(node); - } - - spin_lock_bh(&m->lock); - old_xs = READ_ONCE(*map_entry); - if (old_xs == xs) { - err = 0; - goto out; - } else if (old_xs && map_flags == BPF_NOEXIST) { - err = -EEXIST; - goto out; - } else if (!old_xs && map_flags == BPF_EXIST) { - err = -ENOENT; - goto out; - } - xsk_map_sock_add(xs, node); - WRITE_ONCE(*map_entry, xs); - if (old_xs) - xsk_map_sock_delete(old_xs, map_entry); - spin_unlock_bh(&m->lock); - sockfd_put(sock); - return 0; - -out: - spin_unlock_bh(&m->lock); - sockfd_put(sock); - xsk_map_node_free(node); - return err; -} - -static int xsk_map_delete_elem(struct bpf_map *map, void *key) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *old_xs, **map_entry; - int k = *(u32 *)key; - - if (k >= map->max_entries) - return -EINVAL; - - spin_lock_bh(&m->lock); - map_entry = &m->xsk_map[k]; - old_xs = xchg(map_entry, NULL); - if (old_xs) - xsk_map_sock_delete(old_xs, map_entry); - spin_unlock_bh(&m->lock); - - return 0; -} - -void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, - struct xdp_sock **map_entry) -{ - spin_lock_bh(&map->lock); - if (READ_ONCE(*map_entry) == xs) { - WRITE_ONCE(*map_entry, NULL); - xsk_map_sock_delete(xs, map_entry); - } - spin_unlock_bh(&map->lock); -} - -const struct bpf_map_ops xsk_map_ops = { - .map_alloc = xsk_map_alloc, - .map_free = xsk_map_free, - .map_get_next_key = xsk_map_get_next_key, - .map_lookup_elem = xsk_map_lookup_elem, - .map_gen_lookup = xsk_map_gen_lookup, - .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only, - 
.map_update_elem = xsk_map_update_elem, - .map_delete_elem = xsk_map_delete_elem, - .map_check_btf = map_check_no_btf, -}; diff --git a/net/xdp/Makefile b/net/xdp/Makefile index 71e2bdafb2ce..90b5460d6166 100644 --- a/net/xdp/Makefile +++ b/net/xdp/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o +obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o xskmap.o obj-$(CONFIG_XDP_SOCKETS_DIAG) += xsk_diag.o diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h index 4cfd106bdb53..d6a0979050e6 100644 --- a/net/xdp/xsk.h +++ b/net/xdp/xsk.h @@ -17,9 +17,25 @@ struct xdp_mmap_offsets_v1 { struct xdp_ring_offset_v1 cr; }; +/* Nodes are linked in the struct xdp_sock map_list field, and used to + * track which maps a certain socket reside in. + */ + +struct xsk_map_node { + struct list_head node; + struct xsk_map *map; + struct xdp_sock **map_entry; +}; + static inline struct xdp_sock *xdp_sk(struct sock *sk) { return (struct xdp_sock *)sk; } +bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); +void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, + struct xdp_sock **map_entry); +int xsk_map_inc(struct xsk_map *map); +void xsk_map_put(struct xsk_map *map); + #endif /* XSK_H_ */ diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c new file mode 100644 index 000000000000..1dc7208c71ba --- /dev/null +++ b/net/xdp/xskmap.c @@ -0,0 +1,267 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XSKMAP used for AF_XDP sockets + * Copyright(c) 2018 Intel Corporation. + */ + +#include +#include +#include +#include +#include + +#include "xsk.h" + +int xsk_map_inc(struct xsk_map *map) +{ + bpf_map_inc(&map->map); + return 0; +} + +void xsk_map_put(struct xsk_map *map) +{ + bpf_map_put(&map->map); +} + +static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, + struct xdp_sock **map_entry) +{ + struct xsk_map_node *node; + int err; + + node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); + if (!node) + return ERR_PTR(-ENOMEM); + + err = xsk_map_inc(map); + if (err) { + kfree(node); + return ERR_PTR(err); + } + + node->map = map; + node->map_entry = map_entry; + return node; +} + +static void xsk_map_node_free(struct xsk_map_node *node) +{ + xsk_map_put(node->map); + kfree(node); +} + +static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node) +{ + spin_lock_bh(&xs->map_list_lock); + list_add_tail(&node->node, &xs->map_list); + spin_unlock_bh(&xs->map_list_lock); +} + +static void xsk_map_sock_delete(struct xdp_sock *xs, + struct xdp_sock **map_entry) +{ + struct xsk_map_node *n, *tmp; + + spin_lock_bh(&xs->map_list_lock); + list_for_each_entry_safe(n, tmp, &xs->map_list, node) { + if (map_entry == n->map_entry) { + list_del(&n->node); + xsk_map_node_free(n); + } + } + spin_unlock_bh(&xs->map_list_lock); +} + +static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) +{ + struct bpf_map_memory mem; + int err, numa_node; + struct xsk_map *m; + u64 size; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != 4 || + attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)) + return ERR_PTR(-EINVAL); + + numa_node = bpf_map_attr_numa_node(attr); + size = struct_size(m, xsk_map, attr->max_entries); + + err = bpf_map_charge_init(&mem, size); + if (err < 0) + return ERR_PTR(err); + + m = bpf_map_area_alloc(size, numa_node); + if (!m) { + bpf_map_charge_finish(&mem); + return ERR_PTR(-ENOMEM); + } + + 
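/* From here on the charge taken via bpf_map_charge_init() is owned by
+	 * the map: bpf_map_charge_move() below transfers it, so it is released
+	 * when the map itself is freed rather than on this function's exit.
+	 */
+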
bpf_map_init_from_attr(&m->map, attr); + bpf_map_charge_move(&m->map.memory, &mem); + spin_lock_init(&m->lock); + + return &m->map; +} + +static void xsk_map_free(struct bpf_map *map) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + + bpf_clear_redirect_map(map); + synchronize_net(); + bpf_map_area_free(m); +} + +static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + u32 index = key ? *(u32 *)key : U32_MAX; + u32 *next = next_key; + + if (index >= m->map.max_entries) { + *next = 0; + return 0; + } + + if (index == m->map.max_entries - 1) + return -ENOENT; + *next = index + 1; + return 0; +} + +static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2; + struct bpf_insn *insn = insn_buf; + + *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); + *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *))); + *insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map)); + *insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0); + *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *insn++ = BPF_MOV64_IMM(ret, 0); + return insn - insn_buf; +} + +static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return __xsk_map_lookup_elem(map, *(u32 *)key); +} + +static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct xdp_sock *xs, *old_xs, **map_entry; + u32 i = *(u32 *)key, fd = *(u32 *)value; + struct xsk_map_node *node; + struct socket *sock; + int err; + + if (unlikely(map_flags > BPF_EXIST)) + return -EINVAL; + if (unlikely(i >= m->map.max_entries)) + return -E2BIG; + + sock = sockfd_lookup(fd, &err); + if (!sock) + return err; + + if (sock->sk->sk_family != PF_XDP) { + sockfd_put(sock); + return -EOPNOTSUPP; + } + + xs = (struct xdp_sock *)sock->sk; + + if (!xsk_is_setup_for_bpf_map(xs)) { + sockfd_put(sock); + return -EOPNOTSUPP; + } + + map_entry = &m->xsk_map[i]; + node = xsk_map_node_alloc(m, map_entry); + if (IS_ERR(node)) { + sockfd_put(sock); + return PTR_ERR(node); + } + + spin_lock_bh(&m->lock); + old_xs = READ_ONCE(*map_entry); + if (old_xs == xs) { + err = 0; + goto out; + } else if (old_xs && map_flags == BPF_NOEXIST) { + err = -EEXIST; + goto out; + } else if (!old_xs && map_flags == BPF_EXIST) { + err = -ENOENT; + goto out; + } + xsk_map_sock_add(xs, node); + WRITE_ONCE(*map_entry, xs); + if (old_xs) + xsk_map_sock_delete(old_xs, map_entry); + spin_unlock_bh(&m->lock); + sockfd_put(sock); + return 0; + +out: + spin_unlock_bh(&m->lock); + sockfd_put(sock); + xsk_map_node_free(node); + return err; +} + +static int xsk_map_delete_elem(struct bpf_map *map, void *key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct xdp_sock *old_xs, **map_entry; + int k = *(u32 *)key; + + if (k >= map->max_entries) + return -EINVAL; + + spin_lock_bh(&m->lock); + map_entry = &m->xsk_map[k]; + old_xs = xchg(map_entry, NULL); + if (old_xs) + xsk_map_sock_delete(old_xs, map_entry); + spin_unlock_bh(&m->lock); + + return 0; +} + +void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, + 
struct xdp_sock **map_entry) +{ + spin_lock_bh(&map->lock); + if (READ_ONCE(*map_entry) == xs) { + WRITE_ONCE(*map_entry, NULL); + xsk_map_sock_delete(xs, map_entry); + } + spin_unlock_bh(&map->lock); +} + +const struct bpf_map_ops xsk_map_ops = { + .map_alloc = xsk_map_alloc, + .map_free = xsk_map_free, + .map_get_next_key = xsk_map_get_next_key, + .map_lookup_elem = xsk_map_lookup_elem, + .map_gen_lookup = xsk_map_gen_lookup, + .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only, + .map_update_elem = xsk_map_update_elem, + .map_delete_elem = xsk_map_delete_elem, + .map_check_btf = map_check_no_btf, +}; -- cgit v1.2.3 From a71506a4fda92a39c8ece119876bc7ccde6d3c9d Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 20 May 2020 21:20:51 +0200 Subject: xsk: Move driver interface to xdp_sock_drv.h Move the AF_XDP zero-copy driver interface to its own include file called xdp_sock_drv.h. This, hopefully, will make it more clear for NIC driver implementors to know what functions to use for zero-copy support. v4->v5: Fix -Wmissing-prototypes by include header file. (Jakub) Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200520192103.355233-4-bjorn.topel@gmail.com --- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 2 +- drivers/net/ethernet/intel/ice/ice_xsk.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 2 +- .../net/ethernet/mellanox/mlx5/core/en/xsk/rx.h | 2 +- .../net/ethernet/mellanox/mlx5/core/en/xsk/tx.h | 2 +- .../net/ethernet/mellanox/mlx5/core/en/xsk/umem.c | 2 +- include/net/xdp_sock.h | 214 +------------------- include/net/xdp_sock_drv.h | 217 +++++++++++++++++++++ net/ethtool/channels.c | 2 +- net/ethtool/ioctl.c | 2 +- net/xdp/xdp_umem.h | 2 +- net/xdp/xsk.c | 2 +- net/xdp/xsk_queue.c | 1 + 15 files changed, 238 insertions(+), 218 deletions(-) create mode 100644 include/net/xdp_sock_drv.h (limited to 'net') diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 2a037ec244b9..d6b2db4f2c65 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -11,7 +11,7 @@ #include "i40e_diag.h" #include "i40e_xsk.h" #include -#include +#include /* All i40e tracepoints are defined by the include below, which * must be included exactly once across the whole kernel with * CREATE_TRACE_POINTS defined diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 2b9184aead5f..d8b0be29099a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -2,7 +2,7 @@ /* Copyright(c) 2018 Intel Corporation. */ #include -#include +#include #include #include "i40e.h" diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 23e5515d4527..70e204307a93 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -2,7 +2,7 @@ /* Copyright (c) 2019, Intel Corporation. */ #include -#include +#include #include #include "ice.h" #include "ice_base.h" diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index a656ee9a1fae..82e4effae704 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -2,7 +2,7 @@ /* Copyright(c) 2018 Intel Corporation. 
*/ #include -#include +#include #include #include "ixgbe.h" diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 761c8979bd41..3507d23f0eb8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -31,7 +31,7 @@ */ #include -#include +#include #include "en/xdp.h" #include "en/params.h" diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h index cab0e93497ae..a8e11adbf426 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h @@ -5,7 +5,7 @@ #define __MLX5_EN_XSK_RX_H__ #include "en.h" -#include +#include /* RX data path */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h index 79b487d89757..39fa0a705856 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h @@ -5,7 +5,7 @@ #define __MLX5_EN_XSK_TX_H__ #include "en.h" -#include +#include /* TX data path */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c index 4baaa5788320..5e49fdb564b3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* Copyright (c) 2019 Mellanox Technologies. */ -#include +#include #include "umem.h" #include "setup.h" #include "en/params.h" diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 8f3f6f5b0dfe..6a986dcbc336 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -15,6 +15,7 @@ struct net_device; struct xsk_queue; +struct xdp_buff; /* Masks for xdp_umem_page flags. 
* The low 12-bits of the addr will be 0 since this is the page address, so we @@ -101,27 +102,9 @@ struct xdp_sock { spinlock_t map_list_lock; }; -struct xdp_buff; #ifdef CONFIG_XDP_SOCKETS -int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); -/* Used from netdev driver */ -bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt); -bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); -void xsk_umem_release_addr(struct xdp_umem *umem); -void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); -bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc); -void xsk_umem_consume_tx_done(struct xdp_umem *umem); -struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries); -struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem, - struct xdp_umem_fq_reuse *newq); -void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq); -struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id); -void xsk_set_rx_need_wakeup(struct xdp_umem *umem); -void xsk_set_tx_need_wakeup(struct xdp_umem *umem); -void xsk_clear_rx_need_wakeup(struct xdp_umem *umem); -void xsk_clear_tx_need_wakeup(struct xdp_umem *umem); -bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem); +int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); void __xsk_map_flush(void); @@ -153,131 +136,24 @@ static inline u64 xsk_umem_add_offset_to_addr(u64 addr) return xsk_umem_extract_addr(addr) + xsk_umem_extract_offset(addr); } -static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) -{ - unsigned long page_addr; - - addr = xsk_umem_add_offset_to_addr(addr); - page_addr = (unsigned long)umem->pages[addr >> PAGE_SHIFT].addr; - - return (char *)(page_addr & PAGE_MASK) + (addr & ~PAGE_MASK); -} - -static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) -{ - addr = xsk_umem_add_offset_to_addr(addr); - - return umem->pages[addr >> PAGE_SHIFT].dma + (addr & ~PAGE_MASK); -} - -/* Reuse-queue aware version of FILL queue helpers */ -static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - if (rq->length >= cnt) - return true; - - return xsk_umem_has_addrs(umem, cnt - rq->length); -} - -static inline bool xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - if (!rq->length) - return xsk_umem_peek_addr(umem, addr); - - *addr = rq->handles[rq->length - 1]; - return addr; -} - -static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - if (!rq->length) - xsk_umem_release_addr(umem); - else - rq->length--; -} - -static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - rq->handles[rq->length++] = addr; -} - -/* Handle the offset appropriately depending on aligned or unaligned mode. - * For unaligned mode, we store the offset in the upper 16-bits of the address. - * For aligned mode, we simply add the offset to the address. 
- */ -static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address, - u64 offset) -{ - if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) - return address + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); - else - return address + offset; -} - -static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) -{ - return umem->chunk_size_nohr; -} - #else + static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { return -ENOTSUPP; } -static inline bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) -{ - return false; -} - -static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) -{ - return NULL; -} - -static inline void xsk_umem_release_addr(struct xdp_umem *umem) -{ -} - -static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) -{ -} - -static inline bool xsk_umem_consume_tx(struct xdp_umem *umem, - struct xdp_desc *desc) -{ - return false; -} - -static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem) -{ -} - -static inline struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries) +static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) { - return NULL; + return -EOPNOTSUPP; } -static inline struct xdp_umem_fq_reuse *xsk_reuseq_swap( - struct xdp_umem *umem, - struct xdp_umem_fq_reuse *newq) -{ - return NULL; -} -static inline void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq) +static inline void __xsk_map_flush(void) { } -static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, - u16 queue_id) +static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, + u32 key) { return NULL; } @@ -297,80 +173,6 @@ static inline u64 xsk_umem_add_offset_to_addr(u64 addr) return 0; } -static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) -{ - return NULL; -} - -static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) -{ - return 0; -} - -static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) -{ - return false; -} - -static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) -{ - return NULL; -} - -static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) -{ -} - -static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) -{ -} - -static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem) -{ -} - -static inline void xsk_set_tx_need_wakeup(struct xdp_umem *umem) -{ -} - -static inline void xsk_clear_rx_need_wakeup(struct xdp_umem *umem) -{ -} - -static inline void xsk_clear_tx_need_wakeup(struct xdp_umem *umem) -{ -} - -static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) -{ - return false; -} - -static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle, - u64 offset) -{ - return 0; -} - -static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) -{ - return 0; -} - -static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) -{ - return -EOPNOTSUPP; -} - -static inline void __xsk_map_flush(void) -{ -} - -static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, - u32 key) -{ - return NULL; -} #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h new file mode 100644 index 000000000000..d67f2361937a --- /dev/null +++ b/include/net/xdp_sock_drv.h @@ -0,0 +1,217 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Interface for implementing AF_XDP zero-copy support in drivers. + * Copyright(c) 2020 Intel Corporation. 
+ */ + +#ifndef _LINUX_XDP_SOCK_DRV_H +#define _LINUX_XDP_SOCK_DRV_H + +#include + +#ifdef CONFIG_XDP_SOCKETS + +bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt); +bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); +void xsk_umem_release_addr(struct xdp_umem *umem); +void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); +bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc); +void xsk_umem_consume_tx_done(struct xdp_umem *umem); +struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries); +struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem, + struct xdp_umem_fq_reuse *newq); +void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq); +struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id); +void xsk_set_rx_need_wakeup(struct xdp_umem *umem); +void xsk_set_tx_need_wakeup(struct xdp_umem *umem); +void xsk_clear_rx_need_wakeup(struct xdp_umem *umem); +void xsk_clear_tx_need_wakeup(struct xdp_umem *umem); +bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem); + +static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) +{ + unsigned long page_addr; + + addr = xsk_umem_add_offset_to_addr(addr); + page_addr = (unsigned long)umem->pages[addr >> PAGE_SHIFT].addr; + + return (char *)(page_addr & PAGE_MASK) + (addr & ~PAGE_MASK); +} + +static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) +{ + addr = xsk_umem_add_offset_to_addr(addr); + + return umem->pages[addr >> PAGE_SHIFT].dma + (addr & ~PAGE_MASK); +} + +/* Reuse-queue aware version of FILL queue helpers */ +static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) +{ + struct xdp_umem_fq_reuse *rq = umem->fq_reuse; + + if (rq->length >= cnt) + return true; + + return xsk_umem_has_addrs(umem, cnt - rq->length); +} + +static inline bool xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) +{ + struct xdp_umem_fq_reuse *rq = umem->fq_reuse; + + if (!rq->length) + return xsk_umem_peek_addr(umem, addr); + + *addr = rq->handles[rq->length - 1]; + return addr; +} + +static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) +{ + struct xdp_umem_fq_reuse *rq = umem->fq_reuse; + + if (!rq->length) + xsk_umem_release_addr(umem); + else + rq->length--; +} + +static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) +{ + struct xdp_umem_fq_reuse *rq = umem->fq_reuse; + + rq->handles[rq->length++] = addr; +} + +/* Handle the offset appropriately depending on aligned or unaligned mode. + * For unaligned mode, we store the offset in the upper 16-bits of the address. + * For aligned mode, we simply add the offset to the address. 
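+ * (XSK_UNALIGNED_BUF_OFFSET_SHIFT is 48, so the offset occupies the
+ * upper 16 bits of the 64-bit buffer handle.)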
+ */ +static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address, + u64 offset) +{ + if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) + return address + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); + else + return address + offset; +} + +static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) +{ + return umem->chunk_size_nohr; +} + +#else + +static inline bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) +{ + return false; +} + +static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) +{ + return NULL; +} + +static inline void xsk_umem_release_addr(struct xdp_umem *umem) +{ +} + +static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) +{ +} + +static inline bool xsk_umem_consume_tx(struct xdp_umem *umem, + struct xdp_desc *desc) +{ + return false; +} + +static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem) +{ +} + +static inline struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries) +{ + return NULL; +} + +static inline struct xdp_umem_fq_reuse *xsk_reuseq_swap( + struct xdp_umem *umem, struct xdp_umem_fq_reuse *newq) +{ + return NULL; +} + +static inline void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq) +{ +} + +static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, + u16 queue_id) +{ + return NULL; +} + +static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) +{ + return NULL; +} + +static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) +{ + return 0; +} + +static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) +{ + return false; +} + +static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) +{ + return NULL; +} + +static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) +{ +} + +static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) +{ +} + +static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem) +{ +} + +static inline void xsk_set_tx_need_wakeup(struct xdp_umem *umem) +{ +} + +static inline void xsk_clear_rx_need_wakeup(struct xdp_umem *umem) +{ +} + +static inline void xsk_clear_tx_need_wakeup(struct xdp_umem *umem) +{ +} + +static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) +{ + return false; +} + +static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle, + u64 offset) +{ + return 0; +} + +static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) +{ + return 0; +} + +#endif /* CONFIG_XDP_SOCKETS */ + +#endif /* _LINUX_XDP_SOCK_DRV_H */ diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c index 389924b65d05..658a8580b464 100644 --- a/net/ethtool/channels.c +++ b/net/ethtool/channels.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -#include +#include #include "netlink.h" #include "common.h" diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 52102ab1709b..74892623bacd 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index a63a9fb251f5..32067fe98f65 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -6,7 +6,7 @@ #ifndef XDP_UMEM_H_ #define XDP_UMEM_H_ -#include +#include int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, u16 queue_id, u16 flags); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 45ffd67b367d..8bda654e82ec 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include 
#include #include "xsk_queue.h" diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index 57fb81bd593c..554b1ebb4d02 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "xsk_queue.h" -- cgit v1.2.3 From 89e4a376e3a3dab639a3947a6c7cf5d461d1aa4c Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 20 May 2020 21:20:52 +0200 Subject: xsk: Move defines only used by AF_XDP internals to xsk.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the XSK_NEXT_PG_CONTIG_{MASK,SHIFT}, and XDP_UMEM_USES_NEED_WAKEUP defines from xdp_sock.h to the AF_XDP internal xsk.h file. Also, start using the BIT{,_ULL} macro instead of explicit shifts. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200520192103.355233-5-bjorn.topel@gmail.com --- include/net/xdp_sock.h | 14 -------------- net/xdp/xsk.h | 14 ++++++++++++++ net/xdp/xsk_queue.h | 2 ++ 3 files changed, 16 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 6a986dcbc336..fb7fe3060175 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -17,13 +17,6 @@ struct net_device; struct xsk_queue; struct xdp_buff; -/* Masks for xdp_umem_page flags. - * The low 12-bits of the addr will be 0 since this is the page address, so we - * can use them for flags. - */ -#define XSK_NEXT_PG_CONTIG_SHIFT 0 -#define XSK_NEXT_PG_CONTIG_MASK (1ULL << XSK_NEXT_PG_CONTIG_SHIFT) - struct xdp_umem_page { void *addr; dma_addr_t dma; @@ -35,13 +28,6 @@ struct xdp_umem_fq_reuse { u64 handles[]; }; -/* Flags for the umem flags field. - * - * The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public - * flags. See inlude/uapi/include/linux/if_xdp.h. - */ -#define XDP_UMEM_USES_NEED_WAKEUP (1 << 1) - struct xdp_umem { struct xsk_queue *fq; struct xsk_queue *cq; diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h index d6a0979050e6..455ddd480f3d 100644 --- a/net/xdp/xsk.h +++ b/net/xdp/xsk.h @@ -4,6 +4,20 @@ #ifndef XSK_H_ #define XSK_H_ +/* Masks for xdp_umem_page flags. + * The low 12-bits of the addr will be 0 since this is the page address, so we + * can use them for flags. + */ +#define XSK_NEXT_PG_CONTIG_SHIFT 0 +#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) + +/* Flags for the umem flags field. + * + * The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public + * flags. See inlude/uapi/include/linux/if_xdp.h. + */ +#define XDP_UMEM_USES_NEED_WAKEUP BIT(1) + struct xdp_ring_offset_v1 { __u64 producer; __u64 consumer; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 648733ec24ac..a322a7dac58c 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -10,6 +10,8 @@ #include #include +#include "xsk.h" + struct xdp_ring { u32 producer ____cacheline_aligned_in_smp; u32 consumer ____cacheline_aligned_in_smp; -- cgit v1.2.3 From 2b43470add8c8ff1e1ee28dffc5c5df97e955d09 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 20 May 2020 21:20:53 +0200 Subject: xsk: Introduce AF_XDP buffer allocation API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to simplify AF_XDP zero-copy enablement for NIC driver developers, a new AF_XDP buffer allocation API is added. The implementation is based on a single core (single producer/consumer) buffer pool for the AF_XDP UMEM. 
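As a rough, non-authoritative sketch of the driver-side flow this series is aiming at: the xsk_buff_*() and xdp_rxq_info_reg_mem_model() calls below are the interface added or used by this series, while every hypothetical_* identifier (and the ring layout behind it) is an invented placeholder, not a real driver symbol.

#include <linux/bpf.h>
#include <net/xdp_sock_drv.h>

/* Sketch only: assumes a driver-private ring that carries the UMEM and an
 * xdp_rxq_info; all hypothetical_* helpers are made up for illustration.
 */
static int hypothetical_rxq_init(struct hypothetical_ring *ring)
{
	xsk_buff_set_rxq_info(ring->umem, &ring->xdp_rxq);

	/* RX buffers on this queue come from the AF_XDP buffer pool. */
	return xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
					  MEM_TYPE_XSK_BUFF_POOL, NULL);
}

static void hypothetical_refill(struct hypothetical_ring *ring, u32 count)
{
	struct xdp_buff *xdp;

	while (count--) {
		xdp = xsk_buff_alloc(ring->umem);
		if (!xdp)
			break;
		/* Hand the buffer's DMA address to the hardware. */
		hypothetical_post_rx_desc(ring, xsk_buff_xdp_get_dma(xdp), xdp);
	}
}

static void hypothetical_clean_rx_irq(struct hypothetical_ring *ring,
				      struct xdp_buff *xdp, u32 len)
{
	xsk_buff_dma_sync_for_cpu(xdp);
	xdp->data_end = xdp->data + len;

	if (hypothetical_run_xdp(ring, xdp) == XDP_DROP)
		xsk_buff_free(xdp);	/* buffer goes back to the pool */
	/* On XDP_REDIRECT to an AF_XDP socket the core releases the buffer. */
}
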
A buffer is allocated using the xsk_buff_alloc() function, and returned using xsk_buff_free(). If a buffer is disassociated with the pool, e.g. when a buffer is passed to an AF_XDP socket, a buffer is said to be released. Currently, the release function is only used by the AF_XDP internals and not visible to the driver. Drivers using this API should register the XDP memory model with the new MEM_TYPE_XSK_BUFF_POOL type. The API is defined in net/xdp_sock_drv.h. The buffer type is struct xdp_buff, and follows the lifetime of regular xdp_buffs, i.e. the lifetime of an xdp_buff is restricted to a NAPI context. In other words, the API is not replacing xdp_frames. In addition to introducing the API and implementations, the AF_XDP core is migrated to use the new APIs. rfc->v1: Fixed build errors/warnings for m68k and riscv. (kbuild test robot) Added headroom/chunk size getter. (Maxim/Björn) v1->v2: Swapped SoBs. (Maxim) v2->v3: Initialize struct xdp_buff member frame_sz. (Björn) Add API to query the DMA address of a frame. (Maxim) Do DMA sync for CPU till the end of the frame to handle possible growth (frame_sz). (Maxim) Signed-off-by: Björn Töpel Signed-off-by: Maxim Mikityanskiy Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200520192103.355233-6-bjorn.topel@gmail.com --- include/net/xdp.h | 4 +- include/net/xdp_sock.h | 2 + include/net/xdp_sock_drv.h | 164 ++++++++++++++++ include/net/xsk_buff_pool.h | 56 ++++++ include/trace/events/xdp.h | 3 +- net/core/xdp.c | 14 +- net/xdp/Makefile | 1 + net/xdp/xdp_umem.c | 19 +- net/xdp/xsk.c | 147 +++++--------- net/xdp/xsk_buff_pool.c | 467 ++++++++++++++++++++++++++++++++++++++++++++ net/xdp/xsk_diag.c | 2 +- net/xdp/xsk_queue.h | 59 ++++-- 12 files changed, 819 insertions(+), 119 deletions(-) create mode 100644 include/net/xsk_buff_pool.h create mode 100644 net/xdp/xsk_buff_pool.c (limited to 'net') diff --git a/include/net/xdp.h b/include/net/xdp.h index 3094fccf5a88..f432134c7c00 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -40,6 +40,7 @@ enum xdp_mem_type { MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ MEM_TYPE_PAGE_POOL, MEM_TYPE_ZERO_COPY, + MEM_TYPE_XSK_BUFF_POOL, MEM_TYPE_MAX, }; @@ -119,7 +120,8 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) int metasize; int headroom; - if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) + if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY || + xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) return xdp_convert_zc_to_xdp_frame(xdp); /* Assure headroom is available for storing info */ diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index fb7fe3060175..6e7265f63c04 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -31,11 +31,13 @@ struct xdp_umem_fq_reuse { struct xdp_umem { struct xsk_queue *fq; struct xsk_queue *cq; + struct xsk_buff_pool *pool; struct xdp_umem_page *pages; u64 chunk_mask; u64 size; u32 headroom; u32 chunk_size_nohr; + u32 chunk_size; struct user_struct *user; refcount_t users; struct work_struct work; diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index d67f2361937a..7752c8663d1b 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -7,6 +7,7 @@ #define _LINUX_XDP_SOCK_DRV_H #include +#include #ifdef CONFIG_XDP_SOCKETS @@ -101,6 +102,94 @@ static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) return umem->chunk_size_nohr; } +static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem) +{ + return XDP_PACKET_HEADROOM + umem->headroom; +} + +static inline u32 
xsk_umem_get_chunk_size(struct xdp_umem *umem) +{ + return umem->chunk_size; +} + +static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem) +{ + return xsk_umem_get_chunk_size(umem) - xsk_umem_get_headroom(umem); +} + +static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem, + struct xdp_rxq_info *rxq) +{ + xp_set_rxq_info(umem->pool, rxq); +} + +static inline void xsk_buff_dma_unmap(struct xdp_umem *umem, + unsigned long attrs) +{ + xp_dma_unmap(umem->pool, attrs); +} + +static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev, + unsigned long attrs) +{ + return xp_dma_map(umem->pool, dev, attrs, umem->pgs, umem->npgs); +} + +static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + + return xp_get_dma(xskb); +} + +static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + + return xp_get_frame_dma(xskb); +} + +static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem) +{ + return xp_alloc(umem->pool); +} + +static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count) +{ + return xp_can_alloc(umem->pool, count); +} + +static inline void xsk_buff_free(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + + xp_free(xskb); +} + +static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr) +{ + return xp_raw_get_dma(umem->pool, addr); +} + +static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr) +{ + return xp_raw_get_data(umem->pool, addr); +} + +static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + + xp_dma_sync_for_cpu(xskb); +} + +static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem, + dma_addr_t dma, + size_t size) +{ + xp_dma_sync_for_device(umem->pool, dma, size); +} + #else static inline bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) @@ -212,6 +301,81 @@ static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) return 0; } +static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem) +{ + return 0; +} + +static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem) +{ + return 0; +} + +static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem) +{ + return 0; +} + +static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem, + struct xdp_rxq_info *rxq) +{ +} + +static inline void xsk_buff_dma_unmap(struct xdp_umem *umem, + unsigned long attrs) +{ +} + +static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev, + unsigned long attrs) +{ + return 0; +} + +static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp) +{ + return 0; +} + +static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp) +{ + return 0; +} + +static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem) +{ + return NULL; +} + +static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count) +{ + return false; +} + +static inline void xsk_buff_free(struct xdp_buff *xdp) +{ +} + +static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr) +{ + return 0; +} + +static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr) +{ + return NULL; +} + +static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) +{ +} + +static inline void 
xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem, + dma_addr_t dma, + size_t size) +{ +} + #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_DRV_H */ diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h new file mode 100644 index 000000000000..9f221b36e405 --- /dev/null +++ b/include/net/xsk_buff_pool.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2020 Intel Corporation. */ + +#ifndef XSK_BUFF_POOL_H_ +#define XSK_BUFF_POOL_H_ + +#include +#include +#include + +struct xsk_buff_pool; +struct xdp_rxq_info; +struct xsk_queue; +struct xdp_desc; +struct device; +struct page; + +struct xdp_buff_xsk { + struct xdp_buff xdp; + dma_addr_t dma; + dma_addr_t frame_dma; + struct xsk_buff_pool *pool; + bool unaligned; + u64 orig_addr; + struct list_head free_list_node; +}; + +/* AF_XDP core. */ +struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks, + u32 chunk_size, u32 headroom, u64 size, + bool unaligned); +void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq); +void xp_destroy(struct xsk_buff_pool *pool); +void xp_release(struct xdp_buff_xsk *xskb); +u64 xp_get_handle(struct xdp_buff_xsk *xskb); +bool xp_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); + +/* AF_XDP, and XDP core. */ +void xp_free(struct xdp_buff_xsk *xskb); + +/* AF_XDP ZC drivers, via xdp_sock_buff.h */ +void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq); +int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, + unsigned long attrs, struct page **pages, u32 nr_pages); +void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs); +struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool); +bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count); +void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr); +dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr); +dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb); +dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb); +void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb); +void xp_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, + size_t size); + +#endif /* XSK_BUFF_POOL_H_ */ diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index b95d65e8c628..48547a12fa27 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -287,7 +287,8 @@ TRACE_EVENT(xdp_devmap_xmit, FN(PAGE_SHARED) \ FN(PAGE_ORDER0) \ FN(PAGE_POOL) \ - FN(ZERO_COPY) + FN(ZERO_COPY) \ + FN(XSK_BUFF_POOL) #define __MEM_TYPE_TP_FN(x) \ TRACE_DEFINE_ENUM(MEM_TYPE_##x); diff --git a/net/core/xdp.c b/net/core/xdp.c index 490b8f5fa8ee..f0ce8b195193 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -17,6 +17,7 @@ #include #include /* struct xdp_mem_allocator */ #include +#include #define REG_STATE_NEW 0x0 #define REG_STATE_REGISTERED 0x1 @@ -361,7 +362,7 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); * of xdp_frames/pages in those cases. */ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, - unsigned long handle) + unsigned long handle, struct xdp_buff *xdp) { struct xdp_mem_allocator *xa; struct page *page; @@ -390,6 +391,11 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); xa->zc_alloc->free(xa->zc_alloc, handle); rcu_read_unlock(); + break; + case MEM_TYPE_XSK_BUFF_POOL: + /* NB! Only valid from an xdp_buff! 
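+	 * The xdp_frame paths (xdp_return_frame() and
+	 * xdp_return_frame_rx_napi()) pass xdp == NULL here, so only
+	 * xdp_return_buff() can reach this case.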
*/ + xsk_buff_free(xdp); + break; default: /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ break; @@ -398,19 +404,19 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, void xdp_return_frame(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, false, 0); + __xdp_return(xdpf->data, &xdpf->mem, false, 0, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, true, 0); + __xdp_return(xdpf->data, &xdpf->mem, true, 0, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); void xdp_return_buff(struct xdp_buff *xdp) { - __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle); + __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle, xdp); } EXPORT_SYMBOL_GPL(xdp_return_buff); diff --git a/net/xdp/Makefile b/net/xdp/Makefile index 90b5460d6166..30cdc4315f42 100644 --- a/net/xdp/Makefile +++ b/net/xdp/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o xskmap.o +obj-$(CONFIG_XDP_SOCKETS) += xsk_buff_pool.o obj-$(CONFIG_XDP_SOCKETS_DIAG) += xsk_diag.o diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 37ace3bc0d48..7f04688045d5 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -245,7 +245,7 @@ static void xdp_umem_release(struct xdp_umem *umem) } xsk_reuseq_destroy(umem); - + xp_destroy(umem->pool); xdp_umem_unmap_pages(umem); xdp_umem_unpin_pages(umem); @@ -390,6 +390,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) umem->size = size; umem->headroom = headroom; umem->chunk_size_nohr = chunk_size - headroom; + umem->chunk_size = chunk_size; umem->npgs = size / PAGE_SIZE; umem->pgs = NULL; umem->user = NULL; @@ -415,11 +416,21 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) } err = xdp_umem_map_pages(umem); - if (!err) - return 0; + if (err) + goto out_pages; - kvfree(umem->pages); + umem->pool = xp_create(umem->pgs, umem->npgs, chunks, chunk_size, + headroom, size, unaligned_chunks); + if (!umem->pool) { + err = -ENOMEM; + goto out_unmap; + } + return 0; +out_unmap: + xdp_umem_unmap_pages(umem); +out_pages: + kvfree(umem->pages); out_pin: xdp_umem_unpin_pages(umem); out_account: diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 8bda654e82ec..6933f0d494ba 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -117,76 +117,67 @@ bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) } EXPORT_SYMBOL(xsk_umem_uses_need_wakeup); -/* If a buffer crosses a page boundary, we need to do 2 memcpy's, one for - * each page. This is only required in copy mode. 
- */ -static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf, - u32 len, u32 metalen) +static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - void *to_buf = xdp_umem_get_data(umem, addr); - - addr = xsk_umem_add_offset_to_addr(addr); - if (xskq_cons_crosses_non_contig_pg(umem, addr, len + metalen)) { - void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr; - u64 page_start = addr & ~(PAGE_SIZE - 1); - u64 first_len = PAGE_SIZE - (addr - page_start); - - memcpy(to_buf, from_buf, first_len); - memcpy(next_pg_addr, from_buf + first_len, - len + metalen - first_len); + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + u64 addr; + int err; - return; + addr = xp_get_handle(xskb); + err = xskq_prod_reserve_desc(xs->rx, addr, len); + if (err) { + xs->rx_dropped++; + return err; } - memcpy(to_buf, from_buf, len + metalen); + xp_release(xskb); + return 0; } -static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len) { - u64 offset = xs->umem->headroom; - u64 addr, memcpy_addr; - void *from_buf; + void *from_buf, *to_buf; u32 metalen; - int err; - - if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) || - len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { - xs->rx_dropped++; - return -ENOSPC; - } - if (unlikely(xdp_data_meta_unsupported(xdp))) { - from_buf = xdp->data; + if (unlikely(xdp_data_meta_unsupported(from))) { + from_buf = from->data; + to_buf = to->data; metalen = 0; } else { - from_buf = xdp->data_meta; - metalen = xdp->data - xdp->data_meta; + from_buf = from->data_meta; + metalen = from->data - from->data_meta; + to_buf = to->data - metalen; } - memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset); - __xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen); - - offset += metalen; - addr = xsk_umem_adjust_offset(xs->umem, addr, offset); - err = xskq_prod_reserve_desc(xs->rx, addr, len); - if (!err) { - xskq_cons_release(xs->umem->fq); - xdp_return_buff(xdp); - return 0; - } - - xs->rx_dropped++; - return err; + memcpy(to_buf, from_buf, len + metalen); } -static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len, + bool explicit_free) { - int err = xskq_prod_reserve_desc(xs->rx, xdp->handle, len); + struct xdp_buff *xsk_xdp; + int err; - if (err) + if (len > xsk_umem_get_rx_frame_size(xs->umem)) { + xs->rx_dropped++; + return -ENOSPC; + } + + xsk_xdp = xsk_buff_alloc(xs->umem); + if (!xsk_xdp) { xs->rx_dropped++; + return -ENOSPC; + } - return err; + xsk_copy_xdp(xsk_xdp, xdp, len); + err = __xsk_rcv_zc(xs, xsk_xdp, len); + if (err) { + xsk_buff_free(xsk_xdp); + return err; + } + if (explicit_free) + xdp_return_buff(xdp); + return 0; } static bool xsk_is_bound(struct xdp_sock *xs) @@ -199,7 +190,8 @@ static bool xsk_is_bound(struct xdp_sock *xs) return false; } -static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, + bool explicit_free) { u32 len; @@ -211,8 +203,10 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) len = xdp->data_end - xdp->data; - return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ? - __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len); + return xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY || + xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ? 
+ __xsk_rcv_zc(xs, xdp, len) : + __xsk_rcv(xs, xdp, len, explicit_free); } static void xsk_flush(struct xdp_sock *xs) @@ -224,46 +218,11 @@ static void xsk_flush(struct xdp_sock *xs) int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { - u32 metalen = xdp->data - xdp->data_meta; - u32 len = xdp->data_end - xdp->data; - u64 offset = xs->umem->headroom; - void *buffer; - u64 addr; int err; spin_lock_bh(&xs->rx_lock); - - if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) { - err = -EINVAL; - goto out_unlock; - } - - if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) || - len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { - err = -ENOSPC; - goto out_drop; - } - - addr = xsk_umem_adjust_offset(xs->umem, addr, offset); - buffer = xdp_umem_get_data(xs->umem, addr); - memcpy(buffer, xdp->data_meta, len + metalen); - - addr = xsk_umem_adjust_offset(xs->umem, addr, metalen); - err = xskq_prod_reserve_desc(xs->rx, addr, len); - if (err) - goto out_drop; - - xskq_cons_release(xs->umem->fq); - xskq_prod_submit(xs->rx); - - spin_unlock_bh(&xs->rx_lock); - - xs->sk.sk_data_ready(&xs->sk); - return 0; - -out_drop: - xs->rx_dropped++; -out_unlock: + err = xsk_rcv(xs, xdp, false); + xsk_flush(xs); spin_unlock_bh(&xs->rx_lock); return err; } @@ -273,7 +232,7 @@ int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); int err; - err = xsk_rcv(xs, xdp); + err = xsk_rcv(xs, xdp, true); if (err) return err; @@ -404,7 +363,7 @@ static int xsk_generic_xmit(struct sock *sk) skb_put(skb, len); addr = desc.addr; - buffer = xdp_umem_get_data(xs->umem, addr); + buffer = xsk_buff_raw_get_data(xs->umem, addr); err = skb_store_bits(skb, 0, buffer, len); /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed @@ -860,6 +819,8 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, q = (optname == XDP_UMEM_FILL_RING) ? 
&xs->umem->fq : &xs->umem->cq; err = xsk_init_queue(entries, q, true); + if (optname == XDP_UMEM_FILL_RING) + xp_set_fq(xs->umem->pool, *q); mutex_unlock(&xs->mutex); return err; } diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c new file mode 100644 index 000000000000..e214a5795a62 --- /dev/null +++ b/net/xdp/xsk_buff_pool.c @@ -0,0 +1,467 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include + +#include "xsk_queue.h" + +struct xsk_buff_pool { + struct xsk_queue *fq; + struct list_head free_list; + dma_addr_t *dma_pages; + struct xdp_buff_xsk *heads; + u64 chunk_mask; + u64 addrs_cnt; + u32 free_list_cnt; + u32 dma_pages_cnt; + u32 heads_cnt; + u32 free_heads_cnt; + u32 headroom; + u32 chunk_size; + u32 frame_len; + bool cheap_dma; + bool unaligned; + void *addrs; + struct device *dev; + struct xdp_buff_xsk *free_heads[]; +}; + +static void xp_addr_unmap(struct xsk_buff_pool *pool) +{ + vunmap(pool->addrs); +} + +static int xp_addr_map(struct xsk_buff_pool *pool, + struct page **pages, u32 nr_pages) +{ + pool->addrs = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + if (!pool->addrs) + return -ENOMEM; + return 0; +} + +void xp_destroy(struct xsk_buff_pool *pool) +{ + if (!pool) + return; + + xp_addr_unmap(pool); + kvfree(pool->heads); + kvfree(pool); +} + +struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks, + u32 chunk_size, u32 headroom, u64 size, + bool unaligned) +{ + struct xsk_buff_pool *pool; + struct xdp_buff_xsk *xskb; + int err; + u32 i; + + pool = kvzalloc(struct_size(pool, free_heads, chunks), GFP_KERNEL); + if (!pool) + goto out; + + pool->heads = kvcalloc(chunks, sizeof(*pool->heads), GFP_KERNEL); + if (!pool->heads) + goto out; + + pool->chunk_mask = ~((u64)chunk_size - 1); + pool->addrs_cnt = size; + pool->heads_cnt = chunks; + pool->free_heads_cnt = chunks; + pool->headroom = headroom; + pool->chunk_size = chunk_size; + pool->cheap_dma = true; + pool->unaligned = unaligned; + pool->frame_len = chunk_size - headroom - XDP_PACKET_HEADROOM; + INIT_LIST_HEAD(&pool->free_list); + + for (i = 0; i < pool->free_heads_cnt; i++) { + xskb = &pool->heads[i]; + xskb->pool = pool; + xskb->xdp.frame_sz = chunk_size - headroom; + pool->free_heads[i] = xskb; + } + + err = xp_addr_map(pool, pages, nr_pages); + if (!err) + return pool; + +out: + xp_destroy(pool); + return NULL; +} + +void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq) +{ + pool->fq = fq; +} + +void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) +{ + u32 i; + + for (i = 0; i < pool->heads_cnt; i++) + pool->heads[i].xdp.rxq = rxq; +} +EXPORT_SYMBOL(xp_set_rxq_info); + +void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) +{ + dma_addr_t *dma; + u32 i; + + if (pool->dma_pages_cnt == 0) + return; + + for (i = 0; i < pool->dma_pages_cnt; i++) { + dma = &pool->dma_pages[i]; + if (*dma) { + dma_unmap_page_attrs(pool->dev, *dma, PAGE_SIZE, + DMA_BIDIRECTIONAL, attrs); + *dma = 0; + } + } + + kvfree(pool->dma_pages); + pool->dma_pages_cnt = 0; + pool->dev = NULL; +} +EXPORT_SYMBOL(xp_dma_unmap); + +static void xp_check_dma_contiguity(struct xsk_buff_pool *pool) +{ + u32 i; + + for (i = 0; i < pool->dma_pages_cnt - 1; i++) { + if (pool->dma_pages[i] + PAGE_SIZE == pool->dma_pages[i + 1]) + pool->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK; + else + pool->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK; + } +} + +static bool __maybe_unused xp_check_swiotlb_dma(struct xsk_buff_pool *pool) +{ +#if defined(CONFIG_SWIOTLB) + 
phys_addr_t paddr; + u32 i; + + for (i = 0; i < pool->dma_pages_cnt; i++) { + paddr = dma_to_phys(pool->dev, pool->dma_pages[i]); + if (is_swiotlb_buffer(paddr)) + return false; + } +#endif + return true; +} + +static bool xp_check_cheap_dma(struct xsk_buff_pool *pool) +{ +#if defined(CONFIG_HAS_DMA) + const struct dma_map_ops *ops = get_dma_ops(pool->dev); + + if (ops) { + return !ops->sync_single_for_cpu && + !ops->sync_single_for_device; + } + + if (!dma_is_direct(ops)) + return false; + + if (!xp_check_swiotlb_dma(pool)) + return false; + + if (!dev_is_dma_coherent(pool->dev)) { +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) + return false; +#endif + } +#endif + return true; +} + +int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, + unsigned long attrs, struct page **pages, u32 nr_pages) +{ + dma_addr_t dma; + u32 i; + + pool->dma_pages = kvcalloc(nr_pages, sizeof(*pool->dma_pages), + GFP_KERNEL); + if (!pool->dma_pages) + return -ENOMEM; + + pool->dev = dev; + pool->dma_pages_cnt = nr_pages; + + for (i = 0; i < pool->dma_pages_cnt; i++) { + dma = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE, + DMA_BIDIRECTIONAL, attrs); + if (dma_mapping_error(dev, dma)) { + xp_dma_unmap(pool, attrs); + return -ENOMEM; + } + pool->dma_pages[i] = dma; + } + + if (pool->unaligned) + xp_check_dma_contiguity(pool); + + pool->dev = dev; + pool->cheap_dma = xp_check_cheap_dma(pool); + return 0; +} +EXPORT_SYMBOL(xp_dma_map); + +static bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, + u64 addr, u32 len) +{ + bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE; + + if (pool->dma_pages_cnt && cross_pg) { + return !(pool->dma_pages[addr >> PAGE_SHIFT] & + XSK_NEXT_PG_CONTIG_MASK); + } + return false; +} + +static bool xp_addr_crosses_non_contig_pg(struct xsk_buff_pool *pool, + u64 addr) +{ + return xp_desc_crosses_non_contig_pg(pool, addr, pool->chunk_size); +} + +void xp_release(struct xdp_buff_xsk *xskb) +{ + xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; +} + +static u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr) +{ + return addr & pool->chunk_mask; +} + +static u64 xp_unaligned_extract_addr(u64 addr) +{ + return addr & XSK_UNALIGNED_BUF_ADDR_MASK; +} + +static u64 xp_unaligned_extract_offset(u64 addr) +{ + return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT; +} + +static u64 xp_unaligned_add_offset_to_addr(u64 addr) +{ + return xp_unaligned_extract_addr(addr) + + xp_unaligned_extract_offset(addr); +} + +static bool xp_check_unaligned(struct xsk_buff_pool *pool, u64 *addr) +{ + *addr = xp_unaligned_extract_addr(*addr); + if (*addr >= pool->addrs_cnt || + *addr + pool->chunk_size > pool->addrs_cnt || + xp_addr_crosses_non_contig_pg(pool, *addr)) + return false; + return true; +} + +static bool xp_check_aligned(struct xsk_buff_pool *pool, u64 *addr) +{ + *addr = xp_aligned_extract_addr(pool, *addr); + return *addr < pool->addrs_cnt; +} + +static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) +{ + struct xdp_buff_xsk *xskb; + u64 addr; + bool ok; + + if (pool->free_heads_cnt == 0) + return NULL; + + xskb = pool->free_heads[--pool->free_heads_cnt]; + + for (;;) { + if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) { + xp_release(xskb); + return NULL; + } + + ok = pool->unaligned ? 
xp_check_unaligned(pool, &addr) : + xp_check_aligned(pool, &addr); + if (!ok) { + pool->fq->invalid_descs++; + xskq_cons_release(pool->fq); + continue; + } + break; + } + xskq_cons_release(pool->fq); + + xskb->orig_addr = addr; + xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom; + if (pool->dma_pages_cnt) { + xskb->frame_dma = (pool->dma_pages[addr >> PAGE_SHIFT] & + ~XSK_NEXT_PG_CONTIG_MASK) + + (addr & ~PAGE_MASK); + xskb->dma = xskb->frame_dma + pool->headroom + + XDP_PACKET_HEADROOM; + } + return xskb; +} + +struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool) +{ + struct xdp_buff_xsk *xskb; + + if (!pool->free_list_cnt) { + xskb = __xp_alloc(pool); + if (!xskb) + return NULL; + } else { + pool->free_list_cnt--; + xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, + free_list_node); + list_del(&xskb->free_list_node); + } + + xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM; + xskb->xdp.data_meta = xskb->xdp.data; + + if (!pool->cheap_dma) { + dma_sync_single_range_for_device(pool->dev, xskb->dma, 0, + pool->frame_len, + DMA_BIDIRECTIONAL); + } + return &xskb->xdp; +} +EXPORT_SYMBOL(xp_alloc); + +bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count) +{ + if (pool->free_list_cnt >= count) + return true; + return xskq_cons_has_entries(pool->fq, count - pool->free_list_cnt); +} +EXPORT_SYMBOL(xp_can_alloc); + +void xp_free(struct xdp_buff_xsk *xskb) +{ + xskb->pool->free_list_cnt++; + list_add(&xskb->free_list_node, &xskb->pool->free_list); +} +EXPORT_SYMBOL(xp_free); + +static bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) +{ + u64 chunk, chunk_end; + + chunk = xp_aligned_extract_addr(pool, desc->addr); + chunk_end = xp_aligned_extract_addr(pool, desc->addr + desc->len); + if (chunk != chunk_end) + return false; + + if (chunk >= pool->addrs_cnt) + return false; + + if (desc->options) + return false; + return true; +} + +static bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) +{ + u64 addr, base_addr; + + base_addr = xp_unaligned_extract_addr(desc->addr); + addr = xp_unaligned_add_offset_to_addr(desc->addr); + + if (desc->len > pool->chunk_size) + return false; + + if (base_addr >= pool->addrs_cnt || addr >= pool->addrs_cnt || + xp_desc_crosses_non_contig_pg(pool, addr, desc->len)) + return false; + + if (desc->options) + return false; + return true; +} + +bool xp_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) +{ + return pool->unaligned ? xp_unaligned_validate_desc(pool, desc) : + xp_aligned_validate_desc(pool, desc); +} + +u64 xp_get_handle(struct xdp_buff_xsk *xskb) +{ + u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start; + + offset += xskb->pool->headroom; + if (!xskb->pool->unaligned) + return xskb->orig_addr + offset; + return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); +} + +void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr) +{ + addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; + return pool->addrs + addr; +} +EXPORT_SYMBOL(xp_raw_get_data); + +dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) +{ + addr = pool->unaligned ? 
xp_unaligned_add_offset_to_addr(addr) : addr; + return (pool->dma_pages[addr >> PAGE_SHIFT] & + ~XSK_NEXT_PG_CONTIG_MASK) + + (addr & ~PAGE_MASK); +} +EXPORT_SYMBOL(xp_raw_get_dma); + +dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb) +{ + return xskb->dma; +} +EXPORT_SYMBOL(xp_get_dma); + +dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb) +{ + return xskb->frame_dma; +} +EXPORT_SYMBOL(xp_get_frame_dma); + +void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb) +{ + if (xskb->pool->cheap_dma) + return; + + dma_sync_single_range_for_cpu(xskb->pool->dev, xskb->dma, 0, + xskb->pool->frame_len, DMA_BIDIRECTIONAL); +} +EXPORT_SYMBOL(xp_dma_sync_for_cpu); + +void xp_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, + size_t size) +{ + if (pool->cheap_dma) + return; + + dma_sync_single_range_for_device(pool->dev, dma, 0, + size, DMA_BIDIRECTIONAL); +} +EXPORT_SYMBOL(xp_dma_sync_for_device); diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c index f59791ba43a0..0163b26aaf63 100644 --- a/net/xdp/xsk_diag.c +++ b/net/xdp/xsk_diag.c @@ -56,7 +56,7 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb) du.id = umem->id; du.size = umem->size; du.num_pages = umem->npgs; - du.chunk_size = umem->chunk_size_nohr + umem->headroom; + du.chunk_size = umem->chunk_size; du.headroom = umem->headroom; du.ifindex = umem->dev ? umem->dev->ifindex : 0; du.queue_id = umem->queue_id; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index a322a7dac58c..9151aef7dbca 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -9,6 +9,7 @@ #include #include #include +#include #include "xsk.h" @@ -172,31 +173,45 @@ out: return false; } -static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, - struct xdp_desc *d, - struct xdp_umem *umem) +static inline bool xskq_cons_read_addr_aligned(struct xsk_queue *q, u64 *addr) { - if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { - if (!xskq_cons_is_valid_unaligned(q, d->addr, d->len, umem)) - return false; + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - if (d->len > umem->chunk_size_nohr || d->options) { - q->invalid_descs++; - return false; - } + while (q->cached_cons != q->cached_prod) { + u32 idx = q->cached_cons & q->ring_mask; + + *addr = ring->desc[idx]; + if (xskq_cons_is_valid_addr(q, *addr)) + return true; + q->cached_cons++; + } + + return false; +} + +static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + + if (q->cached_cons != q->cached_prod) { + u32 idx = q->cached_cons & q->ring_mask; + + *addr = ring->desc[idx]; return true; } - if (!xskq_cons_is_valid_addr(q, d->addr)) - return false; + return false; +} - if (((d->addr + d->len) & q->chunk_mask) != (d->addr & q->chunk_mask) || - d->options) { +static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, + struct xdp_desc *d, + struct xdp_umem *umem) +{ + if (!xp_validate_desc(umem->pool, d)) { q->invalid_descs++; return false; } - return true; } @@ -260,6 +275,20 @@ static inline bool xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr, return xskq_cons_read_addr(q, addr, umem); } +static inline bool xskq_cons_peek_addr_aligned(struct xsk_queue *q, u64 *addr) +{ + if (q->cached_prod == q->cached_cons) + xskq_cons_get_entries(q); + return xskq_cons_read_addr_aligned(q, addr); +} + +static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr) +{ + if (q->cached_prod == q->cached_cons) + xskq_cons_get_entries(q); + return 
xskq_cons_read_addr_unchecked(q, addr); +} + static inline bool xskq_cons_peek_desc(struct xsk_queue *q, struct xdp_desc *desc, struct xdp_umem *umem) -- cgit v1.2.3 From 0807892ecb35734b7ce6f7c29b078f1b60151c94 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 20 May 2020 21:21:00 +0200 Subject: xsk: Remove MEM_TYPE_ZERO_COPY and corresponding code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no users of MEM_TYPE_ZERO_COPY. Remove all corresponding code, including the "handle" member of struct xdp_buff. rfc->v1: Fixed spelling in commit message. (Björn) Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200520192103.355233-13-bjorn.topel@gmail.com --- drivers/net/hyperv/netvsc_bpf.c | 1 - include/net/xdp.h | 9 +-- include/net/xdp_sock.h | 45 ------------ include/net/xdp_sock_drv.h | 149 ---------------------------------------- include/trace/events/xdp.h | 1 - net/core/xdp.c | 42 ++--------- net/xdp/xdp_umem.c | 56 +-------------- net/xdp/xsk.c | 48 +------------ net/xdp/xsk_buff_pool.c | 7 ++ net/xdp/xsk_queue.c | 62 ----------------- net/xdp/xsk_queue.h | 105 ---------------------------- 11 files changed, 15 insertions(+), 510 deletions(-) (limited to 'net') diff --git a/drivers/net/hyperv/netvsc_bpf.c b/drivers/net/hyperv/netvsc_bpf.c index 1e0c024b0a93..8e4141552423 100644 --- a/drivers/net/hyperv/netvsc_bpf.c +++ b/drivers/net/hyperv/netvsc_bpf.c @@ -50,7 +50,6 @@ u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan, xdp->data_end = xdp->data + len; xdp->rxq = &nvchan->xdp_rxq; xdp->frame_sz = PAGE_SIZE; - xdp->handle = 0; memcpy(xdp->data, data, len); diff --git a/include/net/xdp.h b/include/net/xdp.h index f432134c7c00..90f11760bd12 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -39,7 +39,6 @@ enum xdp_mem_type { MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ MEM_TYPE_PAGE_POOL, - MEM_TYPE_ZERO_COPY, MEM_TYPE_XSK_BUFF_POOL, MEM_TYPE_MAX, }; @@ -55,10 +54,6 @@ struct xdp_mem_info { struct page_pool; -struct zero_copy_allocator { - void (*free)(struct zero_copy_allocator *zca, unsigned long handle); -}; - struct xdp_rxq_info { struct net_device *dev; u32 queue_index; @@ -71,7 +66,6 @@ struct xdp_buff { void *data_end; void *data_meta; void *data_hard_start; - unsigned long handle; struct xdp_rxq_info *rxq; u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ }; @@ -120,8 +114,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) int metasize; int headroom; - if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY || - xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) + if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) return xdp_convert_zc_to_xdp_frame(xdp); /* Assure headroom is available for storing info */ diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 6e7265f63c04..96bfc5f5f24e 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -17,26 +17,12 @@ struct net_device; struct xsk_queue; struct xdp_buff; -struct xdp_umem_page { - void *addr; - dma_addr_t dma; -}; - -struct xdp_umem_fq_reuse { - u32 nentries; - u32 length; - u64 handles[]; -}; - struct xdp_umem { struct xsk_queue *fq; struct xsk_queue *cq; struct xsk_buff_pool *pool; - struct xdp_umem_page *pages; - u64 chunk_mask; u64 size; u32 headroom; - u32 chunk_size_nohr; u32 chunk_size; struct user_struct *user; refcount_t users; @@ -48,7 +34,6 @@ struct xdp_umem { u8 flags; int 
id; struct net_device *dev; - struct xdp_umem_fq_reuse *fq_reuse; bool zc; spinlock_t xsk_tx_list_lock; struct list_head xsk_tx_list; @@ -109,21 +94,6 @@ static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, return xs; } -static inline u64 xsk_umem_extract_addr(u64 addr) -{ - return addr & XSK_UNALIGNED_BUF_ADDR_MASK; -} - -static inline u64 xsk_umem_extract_offset(u64 addr) -{ - return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT; -} - -static inline u64 xsk_umem_add_offset_to_addr(u64 addr) -{ - return xsk_umem_extract_addr(addr) + xsk_umem_extract_offset(addr); -} - #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) @@ -146,21 +116,6 @@ static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, return NULL; } -static inline u64 xsk_umem_extract_addr(u64 addr) -{ - return 0; -} - -static inline u64 xsk_umem_extract_offset(u64 addr) -{ - return 0; -} - -static inline u64 xsk_umem_add_offset_to_addr(u64 addr) -{ - return 0; -} - #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 7752c8663d1b..ccf848f7efa4 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -11,16 +11,9 @@ #ifdef CONFIG_XDP_SOCKETS -bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt); -bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); -void xsk_umem_release_addr(struct xdp_umem *umem); void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc); void xsk_umem_consume_tx_done(struct xdp_umem *umem); -struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries); -struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem, - struct xdp_umem_fq_reuse *newq); -void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq); struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id); void xsk_set_rx_need_wakeup(struct xdp_umem *umem); void xsk_set_tx_need_wakeup(struct xdp_umem *umem); @@ -28,80 +21,6 @@ void xsk_clear_rx_need_wakeup(struct xdp_umem *umem); void xsk_clear_tx_need_wakeup(struct xdp_umem *umem); bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem); -static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) -{ - unsigned long page_addr; - - addr = xsk_umem_add_offset_to_addr(addr); - page_addr = (unsigned long)umem->pages[addr >> PAGE_SHIFT].addr; - - return (char *)(page_addr & PAGE_MASK) + (addr & ~PAGE_MASK); -} - -static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) -{ - addr = xsk_umem_add_offset_to_addr(addr); - - return umem->pages[addr >> PAGE_SHIFT].dma + (addr & ~PAGE_MASK); -} - -/* Reuse-queue aware version of FILL queue helpers */ -static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - if (rq->length >= cnt) - return true; - - return xsk_umem_has_addrs(umem, cnt - rq->length); -} - -static inline bool xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - if (!rq->length) - return xsk_umem_peek_addr(umem, addr); - - *addr = rq->handles[rq->length - 1]; - return addr; -} - -static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - if (!rq->length) - xsk_umem_release_addr(umem); - else - rq->length--; -} - -static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) -{ - struct xdp_umem_fq_reuse *rq = 
umem->fq_reuse; - - rq->handles[rq->length++] = addr; -} - -/* Handle the offset appropriately depending on aligned or unaligned mode. - * For unaligned mode, we store the offset in the upper 16-bits of the address. - * For aligned mode, we simply add the offset to the address. - */ -static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address, - u64 offset) -{ - if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) - return address + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); - else - return address + offset; -} - -static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) -{ - return umem->chunk_size_nohr; -} - static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem) { return XDP_PACKET_HEADROOM + umem->headroom; @@ -192,20 +111,6 @@ static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem, #else -static inline bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) -{ - return false; -} - -static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) -{ - return NULL; -} - -static inline void xsk_umem_release_addr(struct xdp_umem *umem) -{ -} - static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) { } @@ -220,55 +125,12 @@ static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem) { } -static inline struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries) -{ - return NULL; -} - -static inline struct xdp_umem_fq_reuse *xsk_reuseq_swap( - struct xdp_umem *umem, struct xdp_umem_fq_reuse *newq) -{ - return NULL; -} - -static inline void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq) -{ -} - static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id) { return NULL; } -static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) -{ - return NULL; -} - -static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) -{ - return 0; -} - -static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) -{ - return false; -} - -static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) -{ - return NULL; -} - -static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) -{ -} - -static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) -{ -} - static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem) { } @@ -290,17 +152,6 @@ static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) return false; } -static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle, - u64 offset) -{ - return 0; -} - -static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) -{ - return 0; -} - static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem) { return 0; diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 48547a12fa27..b73d3e141323 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -287,7 +287,6 @@ TRACE_EVENT(xdp_devmap_xmit, FN(PAGE_SHARED) \ FN(PAGE_ORDER0) \ FN(PAGE_POOL) \ - FN(ZERO_COPY) \ FN(XSK_BUFF_POOL) #define __MEM_TYPE_TP_FN(x) \ diff --git a/net/core/xdp.c b/net/core/xdp.c index f0ce8b195193..a8c2f243367d 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -110,27 +110,6 @@ static void mem_allocator_disconnect(void *allocator) mutex_unlock(&mem_id_lock); } -static void mem_id_disconnect(int id) -{ - struct xdp_mem_allocator *xa; - - mutex_lock(&mem_id_lock); - - xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); - if (!xa) { - mutex_unlock(&mem_id_lock); - WARN(1, "Request remove non-existing id(%d), driver bug?", id); - 
return; - } - - trace_mem_disconnect(xa); - - if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) - call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); - - mutex_unlock(&mem_id_lock); -} - void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) { struct xdp_mem_allocator *xa; @@ -144,9 +123,6 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) if (id == 0) return; - if (xdp_rxq->mem.type == MEM_TYPE_ZERO_COPY) - return mem_id_disconnect(id); - if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) { rcu_read_lock(); xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); @@ -302,7 +278,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, xdp_rxq->mem.type = type; if (!allocator) { - if (type == MEM_TYPE_PAGE_POOL || type == MEM_TYPE_ZERO_COPY) + if (type == MEM_TYPE_PAGE_POOL) return -EINVAL; /* Setup time check page_pool req */ return 0; } @@ -362,7 +338,7 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); * of xdp_frames/pages in those cases. */ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, - unsigned long handle, struct xdp_buff *xdp) + struct xdp_buff *xdp) { struct xdp_mem_allocator *xa; struct page *page; @@ -384,14 +360,6 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, page = virt_to_page(data); /* Assumes order0 page*/ put_page(page); break; - case MEM_TYPE_ZERO_COPY: - /* NB! Only valid from an xdp_buff! */ - rcu_read_lock(); - /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ - xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); - xa->zc_alloc->free(xa->zc_alloc, handle); - rcu_read_unlock(); - break; case MEM_TYPE_XSK_BUFF_POOL: /* NB! Only valid from an xdp_buff! */ xsk_buff_free(xdp); @@ -404,19 +372,19 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, void xdp_return_frame(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, false, 0, NULL); + __xdp_return(xdpf->data, &xdpf->mem, false, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, true, 0, NULL); + __xdp_return(xdpf->data, &xdpf->mem, true, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); void xdp_return_buff(struct xdp_buff *xdp) { - __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle, xdp); + __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp); } EXPORT_SYMBOL_GPL(xdp_return_buff); diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 7f04688045d5..19e59d1a5e9f 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -179,37 +179,6 @@ void xdp_umem_clear_dev(struct xdp_umem *umem) umem->zc = false; } -static void xdp_umem_unmap_pages(struct xdp_umem *umem) -{ - unsigned int i; - - for (i = 0; i < umem->npgs; i++) - if (PageHighMem(umem->pgs[i])) - vunmap(umem->pages[i].addr); -} - -static int xdp_umem_map_pages(struct xdp_umem *umem) -{ - unsigned int i; - void *addr; - - for (i = 0; i < umem->npgs; i++) { - if (PageHighMem(umem->pgs[i])) - addr = vmap(&umem->pgs[i], 1, VM_MAP, PAGE_KERNEL); - else - addr = page_address(umem->pgs[i]); - - if (!addr) { - xdp_umem_unmap_pages(umem); - return -ENOMEM; - } - - umem->pages[i].addr = addr; - } - - return 0; -} - static void xdp_umem_unpin_pages(struct xdp_umem *umem) { unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true); @@ -244,14 +213,9 @@ static void xdp_umem_release(struct xdp_umem *umem) umem->cq = NULL; } - xsk_reuseq_destroy(umem); xp_destroy(umem->pool); - 
xdp_umem_unmap_pages(umem); xdp_umem_unpin_pages(umem); - kvfree(umem->pages); - umem->pages = NULL; - xdp_umem_unaccount_pages(umem); kfree(umem); } @@ -385,11 +349,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (headroom >= chunk_size - XDP_PACKET_HEADROOM) return -EINVAL; - umem->chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK - : ~((u64)chunk_size - 1); umem->size = size; umem->headroom = headroom; - umem->chunk_size_nohr = chunk_size - headroom; umem->chunk_size = chunk_size; umem->npgs = size / PAGE_SIZE; umem->pgs = NULL; @@ -408,29 +369,14 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (err) goto out_account; - umem->pages = kvcalloc(umem->npgs, sizeof(*umem->pages), - GFP_KERNEL_ACCOUNT); - if (!umem->pages) { - err = -ENOMEM; - goto out_pin; - } - - err = xdp_umem_map_pages(umem); - if (err) - goto out_pages; - umem->pool = xp_create(umem->pgs, umem->npgs, chunks, chunk_size, headroom, size, unaligned_chunks); if (!umem->pool) { err = -ENOMEM; - goto out_unmap; + goto out_pin; } return 0; -out_unmap: - xdp_umem_unmap_pages(umem); -out_pages: - kvfree(umem->pages); out_pin: xdp_umem_unpin_pages(umem); out_account: diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 6933f0d494ba..3f2ab732ab8b 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -39,24 +39,6 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) READ_ONCE(xs->umem->fq); } -bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) -{ - return xskq_cons_has_entries(umem->fq, cnt); -} -EXPORT_SYMBOL(xsk_umem_has_addrs); - -bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) -{ - return xskq_cons_peek_addr(umem->fq, addr, umem); -} -EXPORT_SYMBOL(xsk_umem_peek_addr); - -void xsk_umem_release_addr(struct xdp_umem *umem) -{ - xskq_cons_release(umem->fq); -} -EXPORT_SYMBOL(xsk_umem_release_addr); - void xsk_set_rx_need_wakeup(struct xdp_umem *umem) { if (umem->need_wakeup & XDP_WAKEUP_RX) @@ -203,8 +185,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, len = xdp->data_end - xdp->data; - return xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY || - xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ? + return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ? __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len, explicit_free); } @@ -588,24 +569,6 @@ static struct socket *xsk_lookup_xsk_from_fd(int fd) return sock; } -/* Check if umem pages are contiguous. - * If zero-copy mode, use the DMA address to do the page contiguity check - * For all other modes we use addr (kernel virtual address) - * Store the result in the low bits of addr. - */ -static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags) -{ - struct xdp_umem_page *pgs = umem->pages; - int i, is_contig; - - for (i = 0; i < umem->npgs - 1; i++) { - is_contig = (flags & XDP_ZEROCOPY) ? - (pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) : - (pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr); - pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT; - } -} - static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; @@ -688,23 +651,14 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) goto out_unlock; } else { /* This xsk has its own umem. 
*/ - xskq_set_umem(xs->umem->fq, xs->umem->size, - xs->umem->chunk_mask); - xskq_set_umem(xs->umem->cq, xs->umem->size, - xs->umem->chunk_mask); - err = xdp_umem_assign_dev(xs->umem, dev, qid, flags); if (err) goto out_unlock; - - xsk_check_page_contiguity(xs->umem, flags); } xs->dev = dev; xs->zc = xs->umem->zc; xs->queue_id = qid; - xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask); - xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask); xdp_add_sk_umem(xs->umem, xs); out_unlock: diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index e214a5795a62..89dae78865e7 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -8,6 +8,13 @@ #include "xsk_queue.h" +/* Masks for xdp_umem_page flags. + * The low 12-bits of the addr will be 0 since this is the page address, so we + * can use them for flags. + */ +#define XSK_NEXT_PG_CONTIG_SHIFT 0 +#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) + struct xsk_buff_pool { struct xsk_queue *fq; struct list_head free_list; diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index 554b1ebb4d02..6cf9586e5027 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -10,15 +10,6 @@ #include "xsk_queue.h" -void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask) -{ - if (!q) - return; - - q->umem_size = umem_size; - q->chunk_mask = chunk_mask; -} - static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue) { struct xdp_umem_ring *umem_ring; @@ -64,56 +55,3 @@ void xskq_destroy(struct xsk_queue *q) page_frag_free(q->ring); kfree(q); } - -struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries) -{ - struct xdp_umem_fq_reuse *newq; - - /* Check for overflow */ - if (nentries > (u32)roundup_pow_of_two(nentries)) - return NULL; - nentries = roundup_pow_of_two(nentries); - - newq = kvmalloc(struct_size(newq, handles, nentries), GFP_KERNEL); - if (!newq) - return NULL; - memset(newq, 0, offsetof(typeof(*newq), handles)); - - newq->nentries = nentries; - return newq; -} -EXPORT_SYMBOL_GPL(xsk_reuseq_prepare); - -struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem, - struct xdp_umem_fq_reuse *newq) -{ - struct xdp_umem_fq_reuse *oldq = umem->fq_reuse; - - if (!oldq) { - umem->fq_reuse = newq; - return NULL; - } - - if (newq->nentries < oldq->length) - return newq; - - memcpy(newq->handles, oldq->handles, - array_size(oldq->length, sizeof(u64))); - newq->length = oldq->length; - - umem->fq_reuse = newq; - return oldq; -} -EXPORT_SYMBOL_GPL(xsk_reuseq_swap); - -void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq) -{ - kvfree(rq); -} -EXPORT_SYMBOL_GPL(xsk_reuseq_free); - -void xsk_reuseq_destroy(struct xdp_umem *umem) -{ - xsk_reuseq_free(umem->fq_reuse); - umem->fq_reuse = NULL; -} diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 9151aef7dbca..16bf15864788 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -32,8 +32,6 @@ struct xdp_umem_ring { }; struct xsk_queue { - u64 chunk_mask; - u64 umem_size; u32 ring_mask; u32 nentries; u32 cached_prod; @@ -106,90 +104,6 @@ struct xsk_queue { /* Functions that read and validate content from consumer rings. 
*/ -static inline bool xskq_cons_crosses_non_contig_pg(struct xdp_umem *umem, - u64 addr, - u64 length) -{ - bool cross_pg = (addr & (PAGE_SIZE - 1)) + length > PAGE_SIZE; - bool next_pg_contig = - (unsigned long)umem->pages[(addr >> PAGE_SHIFT)].addr & - XSK_NEXT_PG_CONTIG_MASK; - - return cross_pg && !next_pg_contig; -} - -static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q, - u64 addr, - u64 length, - struct xdp_umem *umem) -{ - u64 base_addr = xsk_umem_extract_addr(addr); - - addr = xsk_umem_add_offset_to_addr(addr); - if (base_addr >= q->umem_size || addr >= q->umem_size || - xskq_cons_crosses_non_contig_pg(umem, addr, length)) { - q->invalid_descs++; - return false; - } - - return true; -} - -static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr) -{ - if (addr >= q->umem_size) { - q->invalid_descs++; - return false; - } - - return true; -} - -static inline bool xskq_cons_read_addr(struct xsk_queue *q, u64 *addr, - struct xdp_umem *umem) -{ - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - - while (q->cached_cons != q->cached_prod) { - u32 idx = q->cached_cons & q->ring_mask; - - *addr = ring->desc[idx] & q->chunk_mask; - - if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { - if (xskq_cons_is_valid_unaligned(q, *addr, - umem->chunk_size_nohr, - umem)) - return true; - goto out; - } - - if (xskq_cons_is_valid_addr(q, *addr)) - return true; - -out: - q->cached_cons++; - } - - return false; -} - -static inline bool xskq_cons_read_addr_aligned(struct xsk_queue *q, u64 *addr) -{ - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - - while (q->cached_cons != q->cached_prod) { - u32 idx = q->cached_cons & q->ring_mask; - - *addr = ring->desc[idx]; - if (xskq_cons_is_valid_addr(q, *addr)) - return true; - - q->cached_cons++; - } - - return false; -} - static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; @@ -267,21 +181,6 @@ static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) return entries >= cnt; } -static inline bool xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr, - struct xdp_umem *umem) -{ - if (q->cached_prod == q->cached_cons) - xskq_cons_get_entries(q); - return xskq_cons_read_addr(q, addr, umem); -} - -static inline bool xskq_cons_peek_addr_aligned(struct xsk_queue *q, u64 *addr) -{ - if (q->cached_prod == q->cached_cons) - xskq_cons_get_entries(q); - return xskq_cons_read_addr_aligned(q, addr); -} - static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr) { if (q->cached_prod == q->cached_cons) @@ -410,11 +309,7 @@ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) return q ? q->invalid_descs : 0; } -void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask); struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q_ops); -/* Executed by the core when the entire UMEM gets freed */ -void xsk_reuseq_destroy(struct xdp_umem *umem); - #endif /* _LINUX_XSK_QUEUE_H */ -- cgit v1.2.3 From 82c41671ca4f597b6ff05bd5d118161deec26e07 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 20 May 2020 21:21:01 +0200 Subject: xdp: Simplify xdp_return_{frame, frame_rx_napi, buff} MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xdp_return_{frame,frame_rx_napi,buff} function are never used, except in xdp_convert_zc_to_xdp_frame(), by the MEM_TYPE_XSK_BUFF_POOL memory type. 
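For illustration only (nothing below is from this patch), the driver-facing free path for such a buffer is xsk_buff_free(); the helper name mydrv_free_rx_buff() is hypothetical:

#include <net/xdp_sock_drv.h>

/* Hypothetical driver-side sketch: pool-backed buffers go back to
 * their xsk_buff_pool via xsk_buff_free() and must no longer pass
 * through __xdp_return().
 */
static void mydrv_free_rx_buff(struct xdp_buff *xdp)
{
	xsk_buff_free(xdp);
}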
To simplify and reduce code, change so that xdp_convert_zc_to_xdp_frame() calls xsk_buff_free() directly, since the type is known, and remove MEM_TYPE_XSK_BUFF_POOL from the switch statement in the __xdp_return() function. Suggested-by: Maxim Mikityanskiy Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200520192103.355233-14-bjorn.topel@gmail.com --- net/core/xdp.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/core/xdp.c b/net/core/xdp.c index a8c2f243367d..90f44f382115 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -335,10 +335,11 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); * scenarios (e.g. queue full), it is possible to return the xdp_frame * while still leveraging this protection. The @napi_direct boolean * is used for those calls sites. Thus, allowing for faster recycling - * of xdp_frames/pages in those cases. + * of xdp_frames/pages in those cases. This path is never used by the + * MEM_TYPE_XSK_BUFF_POOL memory type, so it's explicitly not part of + * the switch-statement. */ -static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, - struct xdp_buff *xdp) +static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) { struct xdp_mem_allocator *xa; struct page *page; @@ -360,33 +361,29 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, page = virt_to_page(data); /* Assumes order0 page*/ put_page(page); break; - case MEM_TYPE_XSK_BUFF_POOL: - /* NB! Only valid from an xdp_buff! */ - xsk_buff_free(xdp); - break; default: /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ + WARN(1, "Incorrect XDP memory type (%d) usage", mem->type); break; } } void xdp_return_frame(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, false, NULL); + __xdp_return(xdpf->data, &xdpf->mem, false); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, true, NULL); + __xdp_return(xdpf->data, &xdpf->mem, true); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); void xdp_return_buff(struct xdp_buff *xdp) { - __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp); + __xdp_return(xdp->data, &xdp->rxq->mem, true); } -EXPORT_SYMBOL_GPL(xdp_return_buff); /* Only called for MEM_TYPE_PAGE_POOL see xdp.h */ void __xdp_release_frame(void *data, struct xdp_mem_info *mem) @@ -467,7 +464,7 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) xdpf->metasize = metasize; xdpf->mem.type = MEM_TYPE_PAGE_ORDER0; - xdp_return_buff(xdp); + xsk_buff_free(xdp); return xdpf; } EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame); -- cgit v1.2.3 From 26062b185eee49142adc45f9aa187d909d02d961 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 20 May 2020 21:21:02 +0200 Subject: xsk: Explicitly inline functions and move definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to reduce the number of function calls, the struct xsk_buff_pool definition is moved to xsk_buff_pool.h. The functions xp_get_dma(), xp_dma_sync_for_cpu(), xp_dma_sync_for_device(), xp_validate_desc() and various helper functions are explicitly inlined. Further, xp_get_handle() and xp_release() are moved to xsk.c to allow the compiler to perform inlining. rfc->v1: Make sure xp_validate_desc() is inlined for Tx perf.
(Maxim) Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200520192103.355233-15-bjorn.topel@gmail.com --- include/net/xsk_buff_pool.h | 98 ++++++++++++++++++++++++++--- net/xdp/xsk.c | 15 +++++ net/xdp/xsk_buff_pool.c | 148 ++------------------------------------------ net/xdp/xsk_queue.h | 45 ++++++++++++++ 4 files changed, 156 insertions(+), 150 deletions(-) (limited to 'net') diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 9f221b36e405..a4ff226505c9 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -4,6 +4,7 @@ #ifndef XSK_BUFF_POOL_H_ #define XSK_BUFF_POOL_H_ +#include #include #include #include @@ -25,6 +26,27 @@ struct xdp_buff_xsk { struct list_head free_list_node; }; +struct xsk_buff_pool { + struct xsk_queue *fq; + struct list_head free_list; + dma_addr_t *dma_pages; + struct xdp_buff_xsk *heads; + u64 chunk_mask; + u64 addrs_cnt; + u32 free_list_cnt; + u32 dma_pages_cnt; + u32 heads_cnt; + u32 free_heads_cnt; + u32 headroom; + u32 chunk_size; + u32 frame_len; + bool cheap_dma; + bool unaligned; + void *addrs; + struct device *dev; + struct xdp_buff_xsk *free_heads[]; +}; + /* AF_XDP core. */ struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks, u32 chunk_size, u32 headroom, u64 size, @@ -32,8 +54,6 @@ struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks, void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq); void xp_destroy(struct xsk_buff_pool *pool); void xp_release(struct xdp_buff_xsk *xskb); -u64 xp_get_handle(struct xdp_buff_xsk *xskb); -bool xp_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); /* AF_XDP, and XDP core. */ void xp_free(struct xdp_buff_xsk *xskb); @@ -47,10 +67,74 @@ struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool); bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count); void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr); dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr); -dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb); -dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb); -void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb); -void xp_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, - size_t size); +static inline dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb) +{ + return xskb->dma; +} + +static inline dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb) +{ + return xskb->frame_dma; +} + +void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb); +static inline void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb) +{ + if (xskb->pool->cheap_dma) + return; + + xp_dma_sync_for_cpu_slow(xskb); +} + +void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma, + size_t size); +static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool, + dma_addr_t dma, size_t size) +{ + if (pool->cheap_dma) + return; + + xp_dma_sync_for_device_slow(pool, dma, size); +} + +/* Masks for xdp_umem_page flags. + * The low 12-bits of the addr will be 0 since this is the page address, so we + * can use them for flags. 
+ */ +#define XSK_NEXT_PG_CONTIG_SHIFT 0 +#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) + +static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, + u64 addr, u32 len) +{ + bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE; + + if (pool->dma_pages_cnt && cross_pg) { + return !(pool->dma_pages[addr >> PAGE_SHIFT] & + XSK_NEXT_PG_CONTIG_MASK); + } + return false; +} + +static inline u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr) +{ + return addr & pool->chunk_mask; +} + +static inline u64 xp_unaligned_extract_addr(u64 addr) +{ + return addr & XSK_UNALIGNED_BUF_ADDR_MASK; +} + +static inline u64 xp_unaligned_extract_offset(u64 addr) +{ + return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT; +} + +static inline u64 xp_unaligned_add_offset_to_addr(u64 addr) +{ + return xp_unaligned_extract_addr(addr) + + xp_unaligned_extract_offset(addr); +} #endif /* XSK_BUFF_POOL_H_ */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 3f2ab732ab8b..b6c0f08bd80d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -99,6 +99,21 @@ bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) } EXPORT_SYMBOL(xsk_umem_uses_need_wakeup); +void xp_release(struct xdp_buff_xsk *xskb) +{ + xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; +} + +static u64 xp_get_handle(struct xdp_buff_xsk *xskb) +{ + u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start; + + offset += xskb->pool->headroom; + if (!xskb->pool->unaligned) + return xskb->orig_addr + offset; + return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); +} + static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 89dae78865e7..540ed75e4482 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -8,34 +8,6 @@ #include "xsk_queue.h" -/* Masks for xdp_umem_page flags. - * The low 12-bits of the addr will be 0 since this is the page address, so we - * can use them for flags. 
- */ -#define XSK_NEXT_PG_CONTIG_SHIFT 0 -#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) - -struct xsk_buff_pool { - struct xsk_queue *fq; - struct list_head free_list; - dma_addr_t *dma_pages; - struct xdp_buff_xsk *heads; - u64 chunk_mask; - u64 addrs_cnt; - u32 free_list_cnt; - u32 dma_pages_cnt; - u32 heads_cnt; - u32 free_heads_cnt; - u32 headroom; - u32 chunk_size; - u32 frame_len; - bool cheap_dma; - bool unaligned; - void *addrs; - struct device *dev; - struct xdp_buff_xsk *free_heads[]; -}; - static void xp_addr_unmap(struct xsk_buff_pool *pool) { vunmap(pool->addrs); @@ -228,50 +200,12 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, } EXPORT_SYMBOL(xp_dma_map); -static bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, - u64 addr, u32 len) -{ - bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE; - - if (pool->dma_pages_cnt && cross_pg) { - return !(pool->dma_pages[addr >> PAGE_SHIFT] & - XSK_NEXT_PG_CONTIG_MASK); - } - return false; -} - static bool xp_addr_crosses_non_contig_pg(struct xsk_buff_pool *pool, u64 addr) { return xp_desc_crosses_non_contig_pg(pool, addr, pool->chunk_size); } -void xp_release(struct xdp_buff_xsk *xskb) -{ - xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; -} - -static u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr) -{ - return addr & pool->chunk_mask; -} - -static u64 xp_unaligned_extract_addr(u64 addr) -{ - return addr & XSK_UNALIGNED_BUF_ADDR_MASK; -} - -static u64 xp_unaligned_extract_offset(u64 addr) -{ - return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT; -} - -static u64 xp_unaligned_add_offset_to_addr(u64 addr) -{ - return xp_unaligned_extract_addr(addr) + - xp_unaligned_extract_offset(addr); -} - static bool xp_check_unaligned(struct xsk_buff_pool *pool, u64 *addr) { *addr = xp_unaligned_extract_addr(*addr); @@ -370,60 +304,6 @@ void xp_free(struct xdp_buff_xsk *xskb) } EXPORT_SYMBOL(xp_free); -static bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, - struct xdp_desc *desc) -{ - u64 chunk, chunk_end; - - chunk = xp_aligned_extract_addr(pool, desc->addr); - chunk_end = xp_aligned_extract_addr(pool, desc->addr + desc->len); - if (chunk != chunk_end) - return false; - - if (chunk >= pool->addrs_cnt) - return false; - - if (desc->options) - return false; - return true; -} - -static bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, - struct xdp_desc *desc) -{ - u64 addr, base_addr; - - base_addr = xp_unaligned_extract_addr(desc->addr); - addr = xp_unaligned_add_offset_to_addr(desc->addr); - - if (desc->len > pool->chunk_size) - return false; - - if (base_addr >= pool->addrs_cnt || addr >= pool->addrs_cnt || - xp_desc_crosses_non_contig_pg(pool, addr, desc->len)) - return false; - - if (desc->options) - return false; - return true; -} - -bool xp_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) -{ - return pool->unaligned ? xp_unaligned_validate_desc(pool, desc) : - xp_aligned_validate_desc(pool, desc); -} - -u64 xp_get_handle(struct xdp_buff_xsk *xskb) -{ - u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start; - - offset += xskb->pool->headroom; - if (!xskb->pool->unaligned) - return xskb->orig_addr + offset; - return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); -} - void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr) { addr = pool->unaligned ? 
xp_unaligned_add_offset_to_addr(addr) : addr; @@ -440,35 +320,17 @@ dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) } EXPORT_SYMBOL(xp_raw_get_dma); -dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb) -{ - return xskb->dma; -} -EXPORT_SYMBOL(xp_get_dma); - -dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb) +void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb) { - return xskb->frame_dma; -} -EXPORT_SYMBOL(xp_get_frame_dma); - -void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb) -{ - if (xskb->pool->cheap_dma) - return; - dma_sync_single_range_for_cpu(xskb->pool->dev, xskb->dma, 0, xskb->pool->frame_len, DMA_BIDIRECTIONAL); } -EXPORT_SYMBOL(xp_dma_sync_for_cpu); +EXPORT_SYMBOL(xp_dma_sync_for_cpu_slow); -void xp_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, - size_t size) +void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma, + size_t size) { - if (pool->cheap_dma) - return; - dma_sync_single_range_for_device(pool->dev, dma, 0, size, DMA_BIDIRECTIONAL); } -EXPORT_SYMBOL(xp_dma_sync_for_device); +EXPORT_SYMBOL(xp_dma_sync_for_device_slow); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 16bf15864788..5b5d24d2dd37 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -118,6 +118,51 @@ static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) return false; } +static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) +{ + u64 chunk, chunk_end; + + chunk = xp_aligned_extract_addr(pool, desc->addr); + chunk_end = xp_aligned_extract_addr(pool, desc->addr + desc->len); + if (chunk != chunk_end) + return false; + + if (chunk >= pool->addrs_cnt) + return false; + + if (desc->options) + return false; + return true; +} + +static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) +{ + u64 addr, base_addr; + + base_addr = xp_unaligned_extract_addr(desc->addr); + addr = xp_unaligned_add_offset_to_addr(desc->addr); + + if (desc->len > pool->chunk_size) + return false; + + if (base_addr >= pool->addrs_cnt || addr >= pool->addrs_cnt || + xp_desc_crosses_non_contig_pg(pool, addr, desc->len)) + return false; + + if (desc->options) + return false; + return true; +} + +static inline bool xp_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) +{ + return pool->unaligned ? xp_unaligned_validate_desc(pool, desc) : + xp_aligned_validate_desc(pool, desc); +} + static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, struct xdp_umem *umem) -- cgit v1.2.3 From 2092c910e2399c9e13b199c07421133681b9eaff Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 6 May 2020 22:13:30 +0200 Subject: batman-adv: Revert "Drop lockdep.h include for soft-interface.c" The commit 1a33e10e4a95 ("net: partially revert dynamic lockdep key changes") reverts the commit ab92d68fc22f ("net: core: add generic lockdep keys"). But it forgot to also revert the commit 5759af0682b3 ("batman-adv: Drop lockdep.h include for soft-interface.c") which depends on the latter. 
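For context, a minimal sketch of the per-queue lockdep class setup that makes the lockdep.h include necessary again (illustrative only; the names mirror batman-adv's historical pattern and are not quoted from this diff):

#include <linux/lockdep.h>
#include <linux/netdevice.h>

static struct lock_class_key batadv_netdev_xmit_lock_key;

/* Callback for netdev_for_each_tx_queue(); lockdep_set_class() is
 * what pulls in <linux/lockdep.h>.
 */
static void batadv_set_lockdep_class_one(struct net_device *dev,
					 struct netdev_queue *txq,
					 void *_unused)
{
	lockdep_set_class(&txq->_xmit_lock, &batadv_netdev_xmit_lock_key);
}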
Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/soft-interface.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 822af540b854..0ddd80130ea3 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -22,6 +22,7 @@ #include #include #include +#include <linux/lockdep.h> #include #include #include -- cgit v1.2.3 From cf78bb0bbcef3fbe1abf118f14b81dad36eaa94e Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Wed, 20 May 2020 10:41:40 +0200 Subject: batman-adv: use rcu_replace_pointer() where appropriate In commit a63fc6b75cca ("rcu: Upgrade rcu_swap_protected() to rcu_replace_pointer()") a new helper macro named rcu_replace_pointer() was introduced to simplify code that needs to switch an rcu pointer to a new value while extracting the old one. Use rcu_replace_pointer() where appropriate to make code slimmer. Signed-off-by: Antonio Quartulli Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/gateway_client.c | 4 ++-- net/batman-adv/hard-interface.c | 4 ++-- net/batman-adv/routing.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index e22e49289677..a18dcc686dc3 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -146,8 +146,8 @@ static void batadv_gw_select(struct batadv_priv *bat_priv, if (new_gw_node) kref_get(&new_gw_node->refcount); - curr_gw_node = rcu_dereference_protected(bat_priv->gw.curr_gw, 1); - rcu_assign_pointer(bat_priv->gw.curr_gw, new_gw_node); + curr_gw_node = rcu_replace_pointer(bat_priv->gw.curr_gw, new_gw_node, + true); if (curr_gw_node) batadv_gw_node_put(curr_gw_node); diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index c7e98a40dd33..3a256af92784 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -473,8 +473,8 @@ static void batadv_primary_if_select(struct batadv_priv *bat_priv, if (new_hard_iface) kref_get(&new_hard_iface->refcount); - curr_hard_iface = rcu_dereference_protected(bat_priv->primary_if, 1); - rcu_assign_pointer(bat_priv->primary_if, new_hard_iface); + curr_hard_iface = rcu_replace_pointer(bat_priv->primary_if, + new_hard_iface, 1); if (!new_hard_iface) goto out; diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 3632bd976c56..d343382e9664 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -71,13 +71,13 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, * the code needs to ensure the curr_router variable contains a pointer * to the replaced best neighbor. */ - curr_router = rcu_dereference_protected(orig_ifinfo->router, true); /* increase refcount of new best neighbor */ if (neigh_node) kref_get(&neigh_node->refcount); - rcu_assign_pointer(orig_ifinfo->router, neigh_node); + curr_router = rcu_replace_pointer(orig_ifinfo->router, neigh_node, + true); spin_unlock_bh(&orig_node->neigh_list_lock); batadv_orig_ifinfo_put(orig_ifinfo); -- cgit v1.2.3 From 38428d68719c454d269cb03b776d8a4b0ad66111 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 21 May 2020 22:26:13 -0700 Subject: nexthop: support for fdb ecmp nexthops This patch introduces ecmp nexthops and nexthop groups for mac fdb entries. In subsequent patches this is used by the vxlan driver fdb entries.
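As a rough illustration of how a later consumer (for example the vxlan fdb support mentioned above) might resolve one ECMP leg of an fdb nexthop group, consider this sketch; mydrv_resolve_remote() is an invented name and not part of this patch:

#include <linux/in6.h>
#include <net/nexthop.h>

/* Pick one fdb nexthop out of the group via the kernel's group hash
 * and copy out its gateway address; fdb nexthops carry no device.
 */
static int mydrv_resolve_remote(struct nexthop *nh, int hash,
				__be32 *remote4, struct in6_addr *remote6)
{
	struct fib_nh_common *nhc;

	nhc = nexthop_path_fdb_result(nh, hash);
	if (!nhc)
		return -ENOENT;

	if (nhc->nhc_gw_family == AF_INET)
		*remote4 = nhc->nhc_gw.ipv4;
	else
		*remote6 = nhc->nhc_gw.ipv6;
	return 0;
}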
The use case is E-VPN multihoming [1,2,3] which requires bridged vxlan traffic to be load balanced to remote switches (vteps) belonging to the same multi-homed ethernet segment (this is analogous to a multi-homed LAG but over vxlan). Changes include new nexthop flag NHA_FDB for nexthops referenced by fdb entries. These nexthops only have an ip address (no device or encap). This patch includes appropriate checks to avoid routes referencing such nexthops. example: $ip nexthop add id 12 via 172.16.1.2 fdb $ip nexthop add id 13 via 172.16.1.3 fdb $ip nexthop add id 102 group 12/13 fdb $bridge fdb add 02:02:00:00:00:13 dev vxlan1000 nhid 102 self [1] E-VPN https://tools.ietf.org/html/rfc7432 [2] E-VPN VxLAN: https://tools.ietf.org/html/rfc8365 [3] LPC talk with mention of nexthop groups for L2 ecmp http://vger.kernel.org/lpc_net2018_talks/scaling_bridge_fdb_database_slidesV3.pdf v4 - fixed uninitialized variable reported by kernel test robot Reported-by: kernel test robot Signed-off-by: Roopa Prabhu Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 + include/net/nexthop.h | 32 +++++++++++ include/uapi/linux/nexthop.h | 3 + net/ipv4/nexthop.c | 132 +++++++++++++++++++++++++++++++++++-------- net/ipv6/route.c | 5 ++ 5 files changed, 148 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index fdaf975e3331..3f615a29766e 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -65,6 +65,7 @@ struct fib6_config { struct nl_info fc_nlinfo; struct nlattr *fc_encap; u16 fc_encap_type; + bool fc_is_fdb; }; struct fib6_node { diff --git a/include/net/nexthop.h b/include/net/nexthop.h index c440ccc861fc..d929c98931ad 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -26,6 +26,7 @@ struct nh_config { u8 nh_family; u8 nh_protocol; u8 nh_blackhole; + u8 nh_fdb; u32 nh_flags; int nh_ifindex; @@ -52,6 +53,7 @@ struct nh_info { u8 family; bool reject_nh; + bool fdb_nh; union { struct fib_nh_common fib_nhc; @@ -80,6 +82,7 @@ struct nexthop { struct rb_node rb_node; /* entry on netns rbtree */ struct list_head fi_list; /* v4 entries using nh */ struct list_head f6i_list; /* v6 entries using nh */ + struct list_head fdb_list; /* fdb entries using this nh */ struct list_head grp_list; /* nh group entries using this nh */ struct net *net; @@ -88,6 +91,7 @@ struct nexthop { u8 protocol; /* app managing this nh */ u8 nh_flags; bool is_group; + bool is_fdb_nh; refcount_t refcnt; struct rcu_head rcu; @@ -304,4 +308,32 @@ static inline void nexthop_path_fib6_result(struct fib6_result *res, int hash) int nexthop_for_each_fib6_nh(struct nexthop *nh, int (*cb)(struct fib6_nh *nh, void *arg), void *arg); + +static inline int nexthop_get_family(struct nexthop *nh) +{ + struct nh_info *nhi = rcu_dereference_rtnl(nh->nh_info); + + return nhi->family; +} + +static inline +struct fib_nh_common *nexthop_fdb_nhc(struct nexthop *nh) +{ + struct nh_info *nhi = rcu_dereference_rtnl(nh->nh_info); + + return &nhi->fib_nhc; +} + +static inline struct fib_nh_common *nexthop_path_fdb_result(struct nexthop *nh, + int hash) +{ + struct nh_info *nhi; + struct nexthop *nhp; + + nhp = nexthop_select_path(nh, hash); + if (unlikely(!nhp)) + return NULL; + nhi = rcu_dereference(nhp->nh_info); + return &nhi->fib_nhc; +} #endif diff --git a/include/uapi/linux/nexthop.h b/include/uapi/linux/nexthop.h index 7b61867e9848..2d4a1e784cf0 100644 --- a/include/uapi/linux/nexthop.h +++ b/include/uapi/linux/nexthop.h @@ -49,6 +49,9 @@ enum { NHA_GROUPS, /* flag; only return
nexthop groups in dump */ NHA_MASTER, /* u32; only return nexthops with given master dev */ + NHA_FDB, /* flag; nexthop belongs to a bridge fdb */ + /* if NHA_FDB is added, OIF, BLACKHOLE, ENCAP cannot be set */ + __NHA_MAX, }; diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 3957364d556c..bf91edc04631 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -33,6 +33,7 @@ static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = { [NHA_ENCAP] = { .type = NLA_NESTED }, [NHA_GROUPS] = { .type = NLA_FLAG }, [NHA_MASTER] = { .type = NLA_U32 }, + [NHA_FDB] = { .type = NLA_FLAG }, }; static unsigned int nh_dev_hashfn(unsigned int val) @@ -107,6 +108,7 @@ static struct nexthop *nexthop_alloc(void) INIT_LIST_HEAD(&nh->fi_list); INIT_LIST_HEAD(&nh->f6i_list); INIT_LIST_HEAD(&nh->grp_list); + INIT_LIST_HEAD(&nh->fdb_list); } return nh; } @@ -227,6 +229,9 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, if (nla_put_u32(skb, NHA_ID, nh->id)) goto nla_put_failure; + if (nh->is_fdb_nh && nla_put_flag(skb, NHA_FDB)) + goto nla_put_failure; + if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); @@ -241,7 +246,7 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, if (nla_put_flag(skb, NHA_BLACKHOLE)) goto nla_put_failure; goto out; - } else { + } else if (!nh->is_fdb_nh) { const struct net_device *dev; dev = nhi->fib_nhc.nhc_dev; @@ -387,12 +392,35 @@ static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, return true; } +static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family, + struct netlink_ext_ack *extack) +{ + struct nh_info *nhi; + + if (!nh->is_fdb_nh) { + NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops"); + return -EINVAL; + } + + nhi = rtnl_dereference(nh->nh_info); + if (*nh_family == AF_UNSPEC) { + *nh_family = nhi->family; + } else if (*nh_family != nhi->family) { + NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops"); + return -EINVAL; + } + + return 0; +} + static int nh_check_attr_group(struct net *net, struct nlattr *tb[], struct netlink_ext_ack *extack) { unsigned int len = nla_len(tb[NHA_GROUP]); + u8 nh_family = AF_UNSPEC; struct nexthop_grp *nhg; unsigned int i, j; + u8 nhg_fdb = 0; if (len & (sizeof(struct nexthop_grp) - 1)) { NL_SET_ERR_MSG(extack, @@ -421,6 +449,8 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[], } } + if (tb[NHA_FDB]) + nhg_fdb = 1; nhg = nla_data(tb[NHA_GROUP]); for (i = 0; i < len; ++i) { struct nexthop *nh; @@ -432,11 +462,20 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[], } if (!valid_group_nh(nh, len, extack)) return -EINVAL; + + if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack)) + return -EINVAL; + + if (!nhg_fdb && nh->is_fdb_nh) { + NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops"); + return -EINVAL; + } } for (i = NHA_GROUP + 1; i < __NHA_MAX; ++i) { if (!tb[i]) continue; - + if (tb[NHA_FDB]) + continue; NL_SET_ERR_MSG(extack, "No other attributes can be set in nexthop groups"); return -EINVAL; @@ -495,6 +534,9 @@ struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) if (hash > atomic_read(&nhge->upper_bound)) continue; + if (nhge->nh->is_fdb_nh) + return nhge->nh; + /* nexthops always check if it is good and does * not rely on a sysctl for this behavior */ @@ -564,6 +606,11 @@ int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, { struct nh_info *nhi; + if (nh->is_fdb_nh) { + NL_SET_ERR_MSG(extack, "Route 
cannot point to a fdb nexthop"); + return -EINVAL; + } + /* fib6_src is unique to a fib6_info and limits the ability to cache * routes in fib6_nh within a nexthop that is potentially shared * across multiple fib entries. If the config wants to use source @@ -640,6 +687,12 @@ int fib_check_nexthop(struct nexthop *nh, u8 scope, { int err = 0; + if (nh->is_fdb_nh) { + NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); + err = -EINVAL; + goto out; + } + if (nh->is_group) { struct nh_group *nhg; @@ -1125,6 +1178,9 @@ static struct nexthop *nexthop_create_group(struct net *net, nh_group_rebalance(nhg); } + if (cfg->nh_fdb) + nh->is_fdb_nh = 1; + rcu_assign_pointer(nh->nh_grp, nhg); return nh; @@ -1152,7 +1208,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, }; - u32 tb_id = l3mdev_fib_table(cfg->dev); + u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN); int err; err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack); @@ -1161,6 +1217,9 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh, goto out; } + if (nh->is_fdb_nh) + goto out; + /* sets nh_dev if successful */ err = fib_check_nh(net, fib_nh, tb_id, 0, extack); if (!err) { @@ -1186,6 +1245,7 @@ static int nh_create_ipv6(struct net *net, struct nexthop *nh, .fc_flags = cfg->nh_flags, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, + .fc_is_fdb = cfg->nh_fdb, }; int err; @@ -1227,6 +1287,9 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, nhi->family = cfg->nh_family; nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK; + if (cfg->nh_fdb) + nh->is_fdb_nh = 1; + if (cfg->nh_blackhole) { nhi->reject_nh = 1; cfg->nh_ifindex = net->loopback_dev->ifindex; @@ -1248,7 +1311,8 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, } /* add the entry to the device based hash */ - nexthop_devhash_add(net, nhi); + if (!nh->is_fdb_nh) + nexthop_devhash_add(net, nhi); rcu_assign_pointer(nh->nh_info, nhi); @@ -1352,6 +1416,19 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, if (tb[NHA_ID]) cfg->nh_id = nla_get_u32(tb[NHA_ID]); + if (tb[NHA_FDB]) { + if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] || + tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) { + NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole"); + goto out; + } + if (nhm->nh_flags) { + NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header"); + goto out; + } + cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]); + } + if (tb[NHA_GROUP]) { if (nhm->nh_family != AF_UNSPEC) { NL_SET_ERR_MSG(extack, "Invalid family for group"); @@ -1375,8 +1452,8 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, if (tb[NHA_BLACKHOLE]) { if (tb[NHA_GATEWAY] || tb[NHA_OIF] || - tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) { - NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway or oif"); + tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) { + NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb"); goto out; } @@ -1385,26 +1462,28 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, goto out; } - if (!tb[NHA_OIF]) { - NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole nexthops"); + if (!cfg->nh_fdb && !tb[NHA_OIF]) { + NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops"); goto out; } - cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); - if (cfg->nh_ifindex) - cfg->dev = 
__dev_get_by_index(net, cfg->nh_ifindex); + if (!cfg->nh_fdb && tb[NHA_OIF]) { + cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); + if (cfg->nh_ifindex) + cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex); - if (!cfg->dev) { - NL_SET_ERR_MSG(extack, "Invalid device index"); - goto out; - } else if (!(cfg->dev->flags & IFF_UP)) { - NL_SET_ERR_MSG(extack, "Nexthop device is not up"); - err = -ENETDOWN; - goto out; - } else if (!netif_carrier_ok(cfg->dev)) { - NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down"); - err = -ENETDOWN; - goto out; + if (!cfg->dev) { + NL_SET_ERR_MSG(extack, "Invalid device index"); + goto out; + } else if (!(cfg->dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Nexthop device is not up"); + err = -ENETDOWN; + goto out; + } else if (!netif_carrier_ok(cfg->dev)) { + NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down"); + err = -ENETDOWN; + goto out; + } } err = -EINVAL; @@ -1633,7 +1712,7 @@ static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int master_idx, static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx, int *master_idx, bool *group_filter, - struct netlink_callback *cb) + bool *fdb_filter, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[NHA_MAX + 1]; @@ -1670,6 +1749,9 @@ static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx, case NHA_GROUPS: *group_filter = true; break; + case NHA_FDB: + *fdb_filter = true; + break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request"); return -EINVAL; @@ -1688,17 +1770,17 @@ static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx, /* rtnl */ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) { + bool group_filter = false, fdb_filter = false; struct nhmsg *nhm = nlmsg_data(cb->nlh); int dev_filter_idx = 0, master_idx = 0; struct net *net = sock_net(skb->sk); struct rb_root *root = &net->nexthop.rb_root; - bool group_filter = false; struct rb_node *node; int idx = 0, s_idx; int err; err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx, - &group_filter, cb); + &group_filter, &fdb_filter, cb); if (err < 0) return err; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a52ec1b86432..82cbb46a2a4f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3421,6 +3421,11 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, #ifdef CONFIG_IPV6_ROUTER_PREF fib6_nh->last_probe = jiffies; #endif + if (cfg->fc_is_fdb) { + fib6_nh->fib_nh_gw6 = cfg->fc_gateway; + fib6_nh->fib_nh_gw_family = AF_INET6; + return 0; + } err = -ENODEV; if (cfg->fc_ifindex) { -- cgit v1.2.3 From 1274e1cc42264d4e629841e4f182795cb0becfd2 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 21 May 2020 22:26:14 -0700 Subject: vxlan: ecmp support for mac fdb entries Today's vxlan mac fdb entries can point to multiple remote ips (rdsts) with the sole purpose of replicating broadcast-multicast and unknown unicast packets to those remote ips. E-VPN multihoming [1,2,3] requires bridged vxlan traffic to be load balanced to remote switches (vteps) belonging to the same multi-homed ethernet segment (E-VPN multihoming is analogous to multi-homed LAG implementations, but with the inter-switch peerlink replaced with a vxlan tunnel). In other words, it needs support for mac ecmp. Furthermore, for faster convergence, E-VPN multihoming needs the ability to update fdb ecmp nexthops independently of the fdb entries. The new route nexthop API is perfect for this use case.
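As an illustration (not part of the patch itself), the heart of mac ecmp is resolving an fdb nexthop group to one remote gateway per flow; with the helpers added by the previous commit this takes only a few lines. The patch carries the same idea as vxlan_fdb_nh_path_select() in the include/net/vxlan.h hunk below; in this sketch, example_fdb_resolve_remote() is a hypothetical name:

static bool example_fdb_resolve_remote(struct nexthop *nh, struct sk_buff *skb,
				       union vxlan_addr *remote_ip)
{
	struct fib_nh_common *nhc;

	/* pick one member of the ecmp group, keyed by the packet's flow hash */
	nhc = nexthop_path_fdb_result(nh, skb_get_hash(skb));
	if (!nhc)
		return false;

	switch (nhc->nhc_gw_family) {
	case AF_INET:
		remote_ip->sin.sin_addr.s_addr = nhc->nhc_gw.ipv4;
		remote_ip->sa.sa_family = AF_INET;
		break;
	case AF_INET6:
		remote_ip->sin6.sin6_addr = nhc->nhc_gw.ipv6;
		remote_ip->sa.sa_family = AF_INET6;
		break;
	}

	return true;
}

Because the selection is keyed by the flow hash, packets of one flow always go to the same vtep, preserving per-flow ordering while different flows spread across the ethernet segment.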
This patch extends the vxlan fdb code to take a nexthop id pointing to an ecmp nexthop group. Changes include: - New NDA_NH_ID attribute for fdbs - Use the newly added fdb nexthop groups - makes vxlan rdsts and nexthop handling code mutually exclusive - since this is a new use-case and the requirement is for ecmp nexthop groups, the fdb add and update path checks that the nexthop is really an ecmp nexthop group. This check can be relaxed in the future, if we want to introduce replication fdb nexthop groups and allow their use in lieu of current rdst lists. - fdb update requests with nexthop ids are only allowed for existing fdbs that already have nexthop ids - learning will not override an existing fdb entry that has a nexthop group - I have wrapped the switchdev offload code around the presence of an rdst [1] E-VPN RFC https://tools.ietf.org/html/rfc7432 [2] E-VPN with vxlan https://tools.ietf.org/html/rfc8365 [3] http://vger.kernel.org/lpc_net2018_talks/scaling_bridge_fdb_database_slidesV3.pdf Includes a null check fix in vxlan_xmit from Nikolay v2 - Fixed build issue: Reported-by: kbuild test robot Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 306 +++++++++++++++++++++++++++++++++-------- include/net/vxlan.h | 25 ++++ include/uapi/linux/neighbour.h | 1 + net/core/neighbour.c | 2 + 4 files changed, 275 insertions(+), 59 deletions(-) (limited to 'net') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index a5b415fed11e..754e00240eea 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -26,6 +26,7 @@ #include #include #include +#include <net/nexthop.h> #if IS_ENABLED(CONFIG_IPV6) #include @@ -78,6 +79,8 @@ struct vxlan_fdb { u16 state; /* see ndm_state */ __be32 vni; u16 flags; /* see ndm_flags and below */ + struct list_head nh_list; + struct nexthop __rcu *nh; }; #define NTF_VXLAN_ADDED_BY_USER 0x100 @@ -174,11 +177,15 @@ static inline struct hlist_head *vs_head(struct net *net, __be16 port) */ static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb) { + if (rcu_access_pointer(fdb->nh)) + return NULL; return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list); } static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb) { + if (rcu_access_pointer(fdb->nh)) + return NULL; return list_first_entry(&fdb->remotes, struct vxlan_rdst, list); } @@ -251,9 +258,10 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, { unsigned long now = jiffies; struct nda_cacheinfo ci; + bool send_ip, send_eth; struct nlmsghdr *nlh; + struct nexthop *nh; struct ndmsg *ndm; - bool send_ip, send_eth; nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) @@ -264,16 +272,21 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, send_eth = send_ip = true; + nh = rcu_dereference_rtnl(fdb->nh); if (type == RTM_GETNEIGH) { - send_ip = !vxlan_addr_any(&rdst->remote_ip); + if (rdst) { + send_ip = !vxlan_addr_any(&rdst->remote_ip); + ndm->ndm_family = send_ip ? rdst->remote_ip.sa.sa_family : AF_INET; + } else if (nh) { + ndm->ndm_family = nexthop_get_family(nh); + } send_eth = !is_zero_ether_addr(fdb->eth_addr); - ndm->ndm_family = send_ip ? 
rdst->remote_ip.sa.sa_family : AF_INET; } else ndm->ndm_family = AF_BRIDGE; ndm->ndm_state = fdb->state; ndm->ndm_ifindex = vxlan->dev->ifindex; ndm->ndm_flags = fdb->flags; - if (rdst->offloaded) + if (rdst && rdst->offloaded) ndm->ndm_flags |= NTF_OFFLOADED; ndm->ndm_type = RTN_UNICAST; @@ -284,23 +297,30 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) goto nla_put_failure; + if (nh) { + if (nla_put_u32(skb, NDA_NH_ID, nh->id)) + goto nla_put_failure; + } else if (rdst) { + if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, + &rdst->remote_ip)) + goto nla_put_failure; + + if (rdst->remote_port && + rdst->remote_port != vxlan->cfg.dst_port && + nla_put_be16(skb, NDA_PORT, rdst->remote_port)) + goto nla_put_failure; + if (rdst->remote_vni != vxlan->default_dst.remote_vni && + nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni))) + goto nla_put_failure; + if (rdst->remote_ifindex && + nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex)) + goto nla_put_failure; + } - if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip)) - goto nla_put_failure; - - if (rdst->remote_port && rdst->remote_port != vxlan->cfg.dst_port && - nla_put_be16(skb, NDA_PORT, rdst->remote_port)) - goto nla_put_failure; - if (rdst->remote_vni != vxlan->default_dst.remote_vni && - nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni))) - goto nla_put_failure; if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->vni && nla_put_u32(skb, NDA_SRC_VNI, be32_to_cpu(fdb->vni))) goto nla_put_failure; - if (rdst->remote_ifindex && - nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex)) - goto nla_put_failure; ci.ndm_used = jiffies_to_clock_t(now - fdb->used); ci.ndm_confirmed = 0; @@ -401,7 +421,7 @@ static int vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, { int err; - if (swdev_notify) { + if (swdev_notify && rd) { switch (type) { case RTM_NEWNEIGH: err = vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd, @@ -805,6 +825,8 @@ static struct vxlan_fdb *vxlan_fdb_alloc(const u8 *mac, __u16 state, f->flags = ndm_flags; f->updated = f->used = jiffies; f->vni = src_vni; + f->nh = NULL; + INIT_LIST_HEAD(&f->nh_list); INIT_LIST_HEAD(&f->remotes); memcpy(f->eth_addr, mac, ETH_ALEN); @@ -819,11 +841,78 @@ static void vxlan_fdb_insert(struct vxlan_dev *vxlan, const u8 *mac, vxlan_fdb_head(vxlan, mac, src_vni)); } +static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, + u32 nhid, struct netlink_ext_ack *extack) +{ + struct nexthop *old_nh = rtnl_dereference(fdb->nh); + struct nh_group *nhg; + struct nexthop *nh; + int err = -EINVAL; + + if (old_nh && old_nh->id == nhid) + return 0; + + nh = nexthop_find_by_id(vxlan->net, nhid); + if (!nh) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + goto err_inval; + } + + if (nh) { + if (!nexthop_get(nh)) { + NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); + nh = NULL; + goto err_inval; + } + if (!nh->is_fdb_nh) { + NL_SET_ERR_MSG(extack, "Nexthop is not a fdb nexthop"); + goto err_inval; + } + + if (!nh->is_group || !nh->nh_grp->mpath) { + NL_SET_ERR_MSG(extack, "Nexthop is not a multipath group"); + goto err_inval; + } + + /* check nexthop group family */ + nhg = rtnl_dereference(nh->nh_grp); + switch (vxlan->default_dst.remote_ip.sa.sa_family) { + case AF_INET: + if (!nhg->has_v4) { + err = -EAFNOSUPPORT; + NL_SET_ERR_MSG(extack, "Nexthop group family not supported"); + goto err_inval; + } + break; + case AF_INET6: + if (nhg->has_v4) 
{ + err = -EAFNOSUPPORT; + NL_SET_ERR_MSG(extack, "Nexthop group family not supported"); + goto err_inval; + } + } + } + + if (old_nh) { + list_del_rcu(&fdb->nh_list); + nexthop_put(old_nh); + } + rcu_assign_pointer(fdb->nh, nh); + list_add_tail_rcu(&fdb->nh_list, &nh->fdb_list); + return 1; + +err_inval: + if (nh) + nexthop_put(nh); + return err; +} + static int vxlan_fdb_create(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __be16 port, __be32 src_vni, __be32 vni, __u32 ifindex, __u16 ndm_flags, - struct vxlan_fdb **fdb) + u32 nhid, struct vxlan_fdb **fdb, + struct netlink_ext_ack *extack) { struct vxlan_rdst *rd = NULL; struct vxlan_fdb *f; @@ -838,20 +927,33 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan, if (!f) return -ENOMEM; - rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); - if (rc < 0) { - kfree(f); - return rc; - } + if (nhid) + rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack); + else + rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); + if (rc < 0) + goto errout; *fdb = f; return 0; + +errout: + kfree(f); + return rc; } static void __vxlan_fdb_free(struct vxlan_fdb *f) { struct vxlan_rdst *rd, *nd; + struct nexthop *nh; + + nh = rcu_dereference_raw(f->nh); + if (nh) { + rcu_assign_pointer(f->nh, NULL); + list_del_rcu(&f->nh_list); + nexthop_put(nh); + } list_for_each_entry_safe(rd, nd, &f->remotes, list) { dst_cache_destroy(&rd->dst_cache); @@ -875,10 +977,15 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, netdev_dbg(vxlan->dev, "delete %pM\n", f->eth_addr); --vxlan->addrcnt; - if (do_notify) - list_for_each_entry(rd, &f->remotes, list) - vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, + if (do_notify) { + if (rcu_access_pointer(f->nh)) + vxlan_fdb_notify(vxlan, f, NULL, RTM_DELNEIGH, swdev_notify, NULL); + else + list_for_each_entry(rd, &f->remotes, list) + vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, + swdev_notify, NULL); + } hlist_del_rcu(&f->hlist); call_rcu(&f->rcu, vxlan_fdb_free); @@ -897,7 +1004,7 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, __u16 state, __u16 flags, __be16 port, __be32 vni, __u32 ifindex, __u16 ndm_flags, - struct vxlan_fdb *f, + struct vxlan_fdb *f, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack) { @@ -908,6 +1015,18 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, int rc = 0; int err; + if (nhid && !rcu_access_pointer(f->nh)) { + NL_SET_ERR_MSG(extack, + "Cannot replace an existing non nexthop fdb with a nexthop"); + return -EOPNOTSUPP; + } + + if (nhid && (flags & NLM_F_APPEND)) { + NL_SET_ERR_MSG(extack, + "Cannot append to a nexthop fdb"); + return -EOPNOTSUPP; + } + /* Do not allow an externally learned entry to take over an entry added * by the user. 
*/ @@ -929,10 +1048,17 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, /* Only change unicasts */ if (!(is_multicast_ether_addr(f->eth_addr) || is_zero_ether_addr(f->eth_addr))) { - rc = vxlan_fdb_replace(f, ip, port, vni, - ifindex, &oldrd); + if (nhid) { + rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack); + if (rc < 0) + return rc; + } else { + rc = vxlan_fdb_replace(f, ip, port, vni, + ifindex, &oldrd); + } notify |= rc; } else { + NL_SET_ERR_MSG(extack, "Cannot replace non-unicast fdb entries"); return -EOPNOTSUPP; } } @@ -962,6 +1088,8 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, return 0; err_notify: + if (nhid) + return err; if ((flags & NLM_F_REPLACE) && rc) *rd = oldrd; else if ((flags & NLM_F_APPEND) && rc) { @@ -975,7 +1103,7 @@ static int vxlan_fdb_update_create(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __u16 flags, __be16 port, __be32 src_vni, __be32 vni, - __u32 ifindex, __u16 ndm_flags, + __u32 ifindex, __u16 ndm_flags, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack) { @@ -990,7 +1118,7 @@ static int vxlan_fdb_update_create(struct vxlan_dev *vxlan, netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip); rc = vxlan_fdb_create(vxlan, mac, ip, state, port, src_vni, - vni, ifindex, fdb_flags, &f); + vni, ifindex, fdb_flags, nhid, &f, extack); if (rc < 0) return rc; @@ -1012,7 +1140,7 @@ static int vxlan_fdb_update(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __u16 flags, __be16 port, __be32 src_vni, __be32 vni, - __u32 ifindex, __u16 ndm_flags, + __u32 ifindex, __u16 ndm_flags, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack) { @@ -1028,14 +1156,15 @@ static int vxlan_fdb_update(struct vxlan_dev *vxlan, return vxlan_fdb_update_existing(vxlan, ip, state, flags, port, vni, ifindex, ndm_flags, f, - swdev_notify, extack); + nhid, swdev_notify, extack); } else { if (!(flags & NLM_F_CREATE)) return -ENOENT; return vxlan_fdb_update_create(vxlan, mac, ip, state, flags, port, src_vni, vni, ifindex, - ndm_flags, swdev_notify, extack); + ndm_flags, nhid, swdev_notify, + extack); } } @@ -1049,7 +1178,7 @@ static void vxlan_fdb_dst_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, union vxlan_addr *ip, __be16 *port, __be32 *src_vni, - __be32 *vni, u32 *ifindex) + __be32 *vni, u32 *ifindex, u32 *nhid) { struct net *net = dev_net(vxlan->dev); int err; @@ -1109,6 +1238,11 @@ static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, *ifindex = 0; } + if (tb[NDA_NH_ID]) + *nhid = nla_get_u32(tb[NDA_NH_ID]); + else + *nhid = 0; + return 0; } @@ -1123,7 +1257,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], union vxlan_addr ip; __be16 port; __be32 src_vni, vni; - u32 ifindex; + u32 ifindex, nhid; u32 hash_index; int err; @@ -1133,10 +1267,11 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], return -EINVAL; } - if (tb[NDA_DST] == NULL) + if (!tb || (!tb[NDA_DST] && !tb[NDA_NH_ID])) return -EINVAL; - err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex); + err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex, + &nhid); if (err) return err; @@ -1148,7 +1283,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags, port, src_vni, vni, ifindex, ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER, - true, extack); + nhid, true, extack); 
spin_unlock_bh(&vxlan->hash_lock[hash_index]); return err; @@ -1159,8 +1294,8 @@ static int __vxlan_fdb_delete(struct vxlan_dev *vxlan, __be16 port, __be32 src_vni, __be32 vni, u32 ifindex, bool swdev_notify) { - struct vxlan_fdb *f; struct vxlan_rdst *rd = NULL; + struct vxlan_fdb *f; int err = -ENOENT; f = vxlan_find_mac(vxlan, addr, src_vni); @@ -1195,12 +1330,13 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct vxlan_dev *vxlan = netdev_priv(dev); union vxlan_addr ip; __be32 src_vni, vni; - __be16 port; - u32 ifindex; + u32 ifindex, nhid; u32 hash_index; + __be16 port; int err; - err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex); + err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex, + &nhid); if (err) return err; @@ -1228,6 +1364,17 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) { struct vxlan_rdst *rd; + if (rcu_access_pointer(f->nh)) { + err = vxlan_fdb_info(skb, vxlan, f, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, + NLM_F_MULTI, NULL); + if (err < 0) + goto out; + continue; + } + list_for_each_entry_rcu(rd, &f->remotes, list) { if (*idx < cb->args[2]) goto skip; @@ -1311,6 +1458,10 @@ static bool vxlan_snoop(struct net_device *dev, if (f->state & (NUD_PERMANENT | NUD_NOARP)) return true; + /* Don't override an fdb with nexthop with a learnt entry */ + if (rcu_access_pointer(f->nh)) + return true; + if (net_ratelimit()) netdev_info(dev, "%pM migrated from %pIS to %pIS\n", @@ -1333,7 +1484,7 @@ static bool vxlan_snoop(struct net_device *dev, vxlan->cfg.dst_port, vni, vxlan->default_dst.remote_vni, - ifindex, NTF_SELF, true, NULL); + ifindex, NTF_SELF, 0, true, NULL); spin_unlock(&vxlan->hash_lock[hash_index]); } @@ -2616,6 +2767,38 @@ tx_error: kfree_skb(skb); } +static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev, + struct vxlan_fdb *f, __be32 vni, bool did_rsc) +{ + struct vxlan_rdst nh_rdst; + struct nexthop *nh; + bool do_xmit; + u32 hash; + + memset(&nh_rdst, 0, sizeof(struct vxlan_rdst)); + hash = skb_get_hash(skb); + + rcu_read_lock(); + nh = rcu_dereference(f->nh); + if (!nh) { + rcu_read_unlock(); + goto drop; + } + do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst); + rcu_read_unlock(); + + if (likely(do_xmit)) + vxlan_xmit_one(skb, dev, vni, &nh_rdst, did_rsc); + else + goto drop; + + return; + +drop: + dev->stats.tx_dropped++; + dev_kfree_skb(skb); +} + /* Transmit local packets over Vxlan * * Outer IP header inherits ECN and DF from inner header. @@ -2692,22 +2875,27 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) } } - list_for_each_entry_rcu(rdst, &f->remotes, list) { - struct sk_buff *skb1; + if (rcu_access_pointer(f->nh)) { + vxlan_xmit_nh(skb, dev, f, + (vni ? 
: vxlan->default_dst.remote_vni), did_rsc); + } else { + list_for_each_entry_rcu(rdst, &f->remotes, list) { + struct sk_buff *skb1; - if (!fdst) { - fdst = rdst; - continue; + if (!fdst) { + fdst = rdst; + continue; + } + skb1 = skb_clone(skb, GFP_ATOMIC); + if (skb1) + vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc); } - skb1 = skb_clone(skb, GFP_ATOMIC); - if (skb1) - vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc); + if (fdst) + vxlan_xmit_one(skb, dev, vni, fdst, did_rsc); + else + kfree_skb(skb); } - if (fdst) - vxlan_xmit_one(skb, dev, vni, fdst, did_rsc); - else - kfree_skb(skb); return NETDEV_TX_OK; } @@ -3615,7 +3803,7 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, dst->remote_vni, dst->remote_vni, dst->remote_ifindex, - NTF_SELF, &f); + NTF_SELF, 0, &f, extack); if (err) return err; } @@ -4013,7 +4201,7 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], vxlan->cfg.dst_port, conf.vni, conf.vni, conf.remote_ifindex, - NTF_SELF, true, extack); + NTF_SELF, 0, true, extack); if (err) { spin_unlock_bh(&vxlan->hash_lock[hash_index]); netdev_adjacent_change_abort(dst->remote_dev, @@ -4335,7 +4523,7 @@ vxlan_fdb_external_learn_add(struct net_device *dev, fdb_info->remote_vni, fdb_info->remote_ifindex, NTF_USE | NTF_SELF | NTF_EXT_LEARNED, - false, extack); + 0, false, extack); spin_unlock_bh(&vxlan->hash_lock[hash_index]); return err; diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 373aadcfea21..3a41627cbdfe 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -7,6 +7,7 @@ #include #include #include +#include #define IANA_VXLAN_UDP_PORT 4789 @@ -487,4 +488,28 @@ static inline void vxlan_flag_attr_error(int attrtype, #undef VXLAN_FLAG } +static inline bool vxlan_fdb_nh_path_select(struct nexthop *nh, + int hash, + struct vxlan_rdst *rdst) +{ + struct fib_nh_common *nhc; + + nhc = nexthop_path_fdb_result(nh, hash); + if (unlikely(!nhc)) + return false; + + switch (nhc->nhc_gw_family) { + case AF_INET: + rdst->remote_ip.sin.sin_addr.s_addr = nhc->nhc_gw.ipv4; + rdst->remote_ip.sa.sa_family = AF_INET; + break; + case AF_INET6: + rdst->remote_ip.sin6.sin6_addr = nhc->nhc_gw.ipv6; + rdst->remote_ip.sa.sa_family = AF_INET6; + break; + } + + return true; +} + #endif diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index cd144e3099a3..eefcda8ca44e 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -29,6 +29,7 @@ enum { NDA_LINK_NETNSID, NDA_SRC_VNI, NDA_PROTOCOL, /* Originator of entry */ + NDA_NH_ID, __NDA_MAX }; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index b607ea602774..37e4dba62460 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1771,6 +1771,7 @@ static struct neigh_table *neigh_find_table(int family) } const struct nla_policy nda_policy[NDA_MAX+1] = { + [NDA_UNSPEC] = { .strict_start_type = NDA_NH_ID }, [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) }, @@ -1781,6 +1782,7 @@ const struct nla_policy nda_policy[NDA_MAX+1] = { [NDA_IFINDEX] = { .type = NLA_U32 }, [NDA_MASTER] = { .type = NLA_U32 }, [NDA_PROTOCOL] = { .type = NLA_U8 }, + [NDA_NH_ID] = { .type = NLA_U32 }, }; static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, -- cgit v1.2.3 From 8590ceedb70181ad9de5a3dc2cfe50ca33a9576a Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 21 May 2020 22:26:15 -0700 Subject: nexthop: add support 
for notifiers This patch adds nexthop add/del notifiers. To be used by vxlan driver in a later patch. Could possibly be used by switchdev drivers in the future. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/netns/nexthop.h | 1 + include/net/nexthop.h | 12 ++++++++++++ net/ipv4/nexthop.c | 27 +++++++++++++++++++++++++++ 3 files changed, 40 insertions(+) (limited to 'net') diff --git a/include/net/netns/nexthop.h b/include/net/netns/nexthop.h index c712ee5eebd9..1937476c94a0 100644 --- a/include/net/netns/nexthop.h +++ b/include/net/netns/nexthop.h @@ -14,5 +14,6 @@ struct netns_nexthop { unsigned int seq; /* protected by rtnl_mutex */ u32 last_id_allocated; + struct atomic_notifier_head notifier_chain; }; #endif diff --git a/include/net/nexthop.h b/include/net/nexthop.h index d929c98931ad..4c951680f6f9 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -10,6 +10,7 @@ #define __LINUX_NEXTHOP_H #include +#include #include #include #include @@ -102,6 +103,17 @@ struct nexthop { }; }; +enum nexthop_event_type { + NEXTHOP_EVENT_ADD, + NEXTHOP_EVENT_DEL +}; + +int call_nexthop_notifier(struct notifier_block *nb, struct net *net, + enum nexthop_event_type event_type, + struct nexthop *nh); +int register_nexthop_notifier(struct net *net, struct notifier_block *nb); +int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb); + /* caller is holding rcu or rtnl; no reference taken to nexthop */ struct nexthop *nexthop_find_by_id(struct net *net, u32 id); void nexthop_free_rcu(struct rcu_head *head); diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index bf91edc04631..c337e73e02dd 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -36,6 +36,17 @@ static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = { [NHA_FDB] = { .type = NLA_FLAG }, }; +static int call_nexthop_notifiers(struct net *net, + enum fib_event_type event_type, + struct nexthop *nh) +{ + int err; + + err = atomic_notifier_call_chain(&net->nexthop.notifier_chain, + event_type, nh); + return notifier_to_errno(err); +} + static unsigned int nh_dev_hashfn(unsigned int val) { unsigned int mask = NH_DEV_HASHSIZE - 1; @@ -826,6 +837,8 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) bool do_flush = false; struct fib_info *fi; + call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh); + list_for_each_entry(fi, &nh->fi_list, nh_list) { fi->fib_flags |= RTNH_F_DEAD; do_flush = true; @@ -1865,6 +1878,19 @@ static struct notifier_block nh_netdev_notifier = { .notifier_call = nh_netdev_event, }; +int register_nexthop_notifier(struct net *net, struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&net->nexthop.notifier_chain, nb); +} +EXPORT_SYMBOL(register_nexthop_notifier); + +int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&net->nexthop.notifier_chain, + nb); +} +EXPORT_SYMBOL(unregister_nexthop_notifier); + static void __net_exit nexthop_net_exit(struct net *net) { rtnl_lock(); @@ -1881,6 +1907,7 @@ static int __net_init nexthop_net_init(struct net *net) net->nexthop.devhash = kzalloc(sz, GFP_KERNEL); if (!net->nexthop.devhash) return -ENOMEM; + ATOMIC_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain); return 0; } -- cgit v1.2.3 From e7bb18e6c8b7ed35746af8c7c708447a66ee385c Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 20 May 2020 18:21:35 +0300 Subject: ip6_tunnel: simplify transmit path Merge ip{4,6}ip6_tnl_xmit functions into one universal 
ipxip6_tnl_xmit in preparation for adding MPLS support. Signed-off-by: Vadim Fedorenko Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 182 ++++++++++++++++++++++---------------------------- 1 file changed, 79 insertions(+), 103 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 4703b09808d0..dae6f7146b49 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1253,22 +1253,22 @@ tx_err_dst_release: EXPORT_SYMBOL(ip6_tnl_xmit); static inline int -ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) +ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, + u8 protocol) { struct ip6_tnl *t = netdev_priv(dev); + struct ipv6hdr *ipv6h; const struct iphdr *iph; int encap_limit = -1; + __u16 offset; struct flowi6 fl6; - __u8 dsfield; + __u8 dsfield, orig_dsfield; __u32 mtu; u8 tproto; int err; - iph = ip_hdr(skb); - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - tproto = READ_ONCE(t->parms.proto); - if (tproto != IPPROTO_IPIP && tproto != 0) + if (tproto != protocol && tproto != 0) return -1; if (t->parms.collect_md) { @@ -1281,129 +1281,101 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) return -1; key = &tun_info->key; memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPIP; + fl6.flowi6_proto = protocol; fl6.saddr = key->u.ipv6.src; fl6.daddr = key->u.ipv6.dst; fl6.flowlabel = key->label; dsfield = key->tos; + switch (protocol) { + case IPPROTO_IPIP: + iph = ip_hdr(skb); + orig_dsfield = ipv4_get_dsfield(iph); + break; + case IPPROTO_IPV6: + ipv6h = ipv6_hdr(skb); + orig_dsfield = ipv6_get_dsfield(ipv6h); + break; + default: + orig_dsfield = dsfield; + break; + } } else { if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; + if (protocol == IPPROTO_IPV6) { + offset = ip6_tnl_parse_tlv_enc_lim(skb, + skb_network_header(skb)); + /* ip6_tnl_parse_tlv_enc_lim() might have + * reallocated skb->head + */ + if (offset > 0) { + struct ipv6_tlv_tnl_enc_lim *tel; - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPIP; - - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - dsfield = ipv4_get_dsfield(iph); - else - dsfield = ip6_tclass(t->parms.flowinfo); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) - fl6.flowi6_mark = skb->mark; - else - fl6.flowi6_mark = t->parms.fwmark; - } - - fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); - dsfield = INET_ECN_encapsulate(dsfield, ipv4_get_dsfield(iph)); - - if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) - return -1; - - skb_set_inner_ipproto(skb, IPPROTO_IPIP); - - err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, - IPPROTO_IPIP); - if (err != 0) { - /* XXX: send ICMP error even if DF is not set. 
*/ - if (err == -EMSGSIZE) - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(mtu)); - return -1; - } - - return 0; -} - -static inline int -ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) -{ - struct ip6_tnl *t = netdev_priv(dev); - struct ipv6hdr *ipv6h; - int encap_limit = -1; - __u16 offset; - struct flowi6 fl6; - __u8 dsfield; - __u32 mtu; - u8 tproto; - int err; - - ipv6h = ipv6_hdr(skb); - tproto = READ_ONCE(t->parms.proto); - if ((tproto != IPPROTO_IPV6 && tproto != 0) || - ip6_tnl_addr_conflict(t, ipv6h)) - return -1; - - if (t->parms.collect_md) { - struct ip_tunnel_info *tun_info; - const struct ip_tunnel_key *key; - - tun_info = skb_tunnel_info(skb); - if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || - ip_tunnel_info_af(tun_info) != AF_INET6)) - return -1; - key = &tun_info->key; - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPV6; - fl6.saddr = key->u.ipv6.src; - fl6.daddr = key->u.ipv6.dst; - fl6.flowlabel = key->label; - dsfield = key->tos; - } else { - offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); - /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */ - ipv6h = ipv6_hdr(skb); - if (offset > 0) { - struct ipv6_tlv_tnl_enc_lim *tel; - - tel = (void *)&skb_network_header(skb)[offset]; - if (tel->encap_limit == 0) { - icmpv6_send(skb, ICMPV6_PARAMPROB, - ICMPV6_HDR_FIELD, offset + 2); - return -1; + tel = (void *)&skb_network_header(skb)[offset]; + if (tel->encap_limit == 0) { + icmpv6_send(skb, ICMPV6_PARAMPROB, + ICMPV6_HDR_FIELD, offset + 2); + return -1; + } + encap_limit = tel->encap_limit - 1; } - encap_limit = tel->encap_limit - 1; - } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) { - encap_limit = t->parms.encap_limit; } memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPV6; + fl6.flowi6_proto = protocol; - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - dsfield = ipv6_get_dsfield(ipv6h); - else - dsfield = ip6_tclass(t->parms.flowinfo); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) - fl6.flowlabel |= ip6_flowlabel(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; else fl6.flowi6_mark = t->parms.fwmark; + switch (protocol) { + case IPPROTO_IPIP: + iph = ip_hdr(skb); + orig_dsfield = ipv4_get_dsfield(iph); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + dsfield = orig_dsfield; + else + dsfield = ip6_tclass(t->parms.flowinfo); + break; + case IPPROTO_IPV6: + ipv6h = ipv6_hdr(skb); + orig_dsfield = ipv6_get_dsfield(ipv6h); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + dsfield = orig_dsfield; + else + dsfield = ip6_tclass(t->parms.flowinfo); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) + fl6.flowlabel |= ip6_flowlabel(ipv6h); + break; + default: + break; + } } fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); - dsfield = INET_ECN_encapsulate(dsfield, ipv6_get_dsfield(ipv6h)); + dsfield = INET_ECN_encapsulate(dsfield, orig_dsfield); if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; - skb_set_inner_ipproto(skb, IPPROTO_IPV6); + skb_set_inner_ipproto(skb, protocol); err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, - IPPROTO_IPV6); + protocol); if (err != 0) { + /* XXX: send ICMP error even if DF is not set. 
*/ if (err == -EMSGSIZE) - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + switch (protocol) { + case IPPROTO_IPIP: + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_FRAG_NEEDED, htonl(mtu)); + break; + case IPPROTO_IPV6: + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + break; + default: + break; + } return -1; } @@ -1415,6 +1387,7 @@ ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net_device_stats *stats = &t->dev->stats; + u8 ipproto; int ret; if (!pskb_inet_may_pull(skb)) @@ -1422,15 +1395,18 @@ ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) switch (skb->protocol) { case htons(ETH_P_IP): - ret = ip4ip6_tnl_xmit(skb, dev); + ipproto = IPPROTO_IPIP; break; case htons(ETH_P_IPV6): - ret = ip6ip6_tnl_xmit(skb, dev); + if (ip6_tnl_addr_conflict(t, ipv6_hdr(skb))) + goto tx_err; + ipproto = IPPROTO_IPV6; break; default: goto tx_err; } + ret = ipxip6_tnl_xmit(skb, dev, ipproto); if (ret < 0) goto tx_err; -- cgit v1.2.3 From 6c11fbf97e69d2164406fc634758f90d34501ece Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 20 May 2020 18:21:36 +0300 Subject: ip6_tunnel: add MPLS transmit support Add ETH_P_MPLS_UC as supported protocol. Signed-off-by: Vadim Fedorenko Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index dae6f7146b49..6b94c870693c 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1232,6 +1232,8 @@ route_lookup: ipv6_push_frag_opts(skb, &opt.ops, &proto); } + skb_set_inner_ipproto(skb, proto); + skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); @@ -1348,6 +1350,7 @@ ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, fl6.flowlabel |= ip6_flowlabel(ipv6h); break; default: + orig_dsfield = dsfield = ip6_tclass(t->parms.flowinfo); break; } } @@ -1358,8 +1361,6 @@ ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; - skb_set_inner_ipproto(skb, protocol); - err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, protocol); if (err != 0) { @@ -1402,6 +1403,9 @@ ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_err; ipproto = IPPROTO_IPV6; break; + case htons(ETH_P_MPLS_UC): + ipproto = IPPROTO_MPLS; + break; default: goto tx_err; } -- cgit v1.2.3 From f234efac2c6220f32cbc446e75dc1d27b04166c3 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 20 May 2020 18:21:37 +0300 Subject: tunnel6: support for IPPROTO_MPLS This patch is just preparation for MPLS support in ip6_tunnel Signed-off-by: Vadim Fedorenko Signed-off-by: David S. 
Miller --- net/ipv6/tunnel6.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 83 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c index 21e7b95ddbfa..06c02ebe6b9b 100644 --- a/net/ipv6/tunnel6.c +++ b/net/ipv6/tunnel6.c @@ -21,8 +21,14 @@ static struct xfrm6_tunnel __rcu *tunnel6_handlers __read_mostly; static struct xfrm6_tunnel __rcu *tunnel46_handlers __read_mostly; +static struct xfrm6_tunnel __rcu *tunnelmpls6_handlers __read_mostly; static DEFINE_MUTEX(tunnel6_mutex); +static inline int xfrm6_tunnel_mpls_supported(void) +{ + return IS_ENABLED(CONFIG_MPLS); +} + int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family) { struct xfrm6_tunnel __rcu **pprev; @@ -32,8 +38,21 @@ int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family) mutex_lock(&tunnel6_mutex); - for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers; - (t = rcu_dereference_protected(*pprev, + switch (family) { + case AF_INET6: + pprev = &tunnel6_handlers; + break; + case AF_INET: + pprev = &tunnel46_handlers; + break; + case AF_MPLS: + pprev = &tunnelmpls6_handlers; + break; + default: + goto err; + } + + for (; (t = rcu_dereference_protected(*pprev, lockdep_is_held(&tunnel6_mutex))) != NULL; pprev = &t->next) { if (t->priority > priority) @@ -62,8 +81,21 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) mutex_lock(&tunnel6_mutex); - for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers; - (t = rcu_dereference_protected(*pprev, + switch (family) { + case AF_INET6: + pprev = &tunnel6_handlers; + break; + case AF_INET: + pprev = &tunnel46_handlers; + break; + case AF_MPLS: + pprev = &tunnelmpls6_handlers; + break; + default: + goto err; + } + + for (; (t = rcu_dereference_protected(*pprev, lockdep_is_held(&tunnel6_mutex))) != NULL; pprev = &t->next) { if (t == handler) { @@ -73,6 +105,7 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) } } +err: mutex_unlock(&tunnel6_mutex); synchronize_net(); @@ -86,6 +119,24 @@ EXPORT_SYMBOL(xfrm6_tunnel_deregister); handler != NULL; \ handler = rcu_dereference(handler->next)) \ +static int tunnelmpls6_rcv(struct sk_buff *skb) +{ + struct xfrm6_tunnel *handler; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto drop; + + for_each_tunnel_rcu(tunnelmpls6_handlers, handler) + if (!handler->handler(skb)) + return 0; + + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + +drop: + kfree_skb(skb); + return 0; +} + static int tunnel6_rcv(struct sk_buff *skb) { struct xfrm6_tunnel *handler; @@ -146,6 +197,18 @@ static int tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return -ENOENT; } +static int tunnelmpls6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + struct xfrm6_tunnel *handler; + + for_each_tunnel_rcu(tunnelmpls6_handlers, handler) + if (!handler->err_handler(skb, opt, type, code, offset, info)) + return 0; + + return -ENOENT; +} + static const struct inet6_protocol tunnel6_protocol = { .handler = tunnel6_rcv, .err_handler = tunnel6_err, @@ -158,6 +221,12 @@ static const struct inet6_protocol tunnel46_protocol = { .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; +static const struct inet6_protocol tunnelmpls6_protocol = { + .handler = tunnelmpls6_rcv, + .err_handler = tunnelmpls6_err, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, +}; + static int __init 
tunnel6_init(void) { if (inet6_add_protocol(&tunnel6_protocol, IPPROTO_IPV6)) { @@ -169,6 +238,13 @@ static int __init tunnel6_init(void) inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6); return -EAGAIN; } + if (xfrm6_tunnel_mpls_supported() && + inet6_add_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS)) { + pr_err("%s: can't add protocol\n", __func__); + inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6); + inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP); + return -EAGAIN; + } return 0; } @@ -178,6 +254,9 @@ static void __exit tunnel6_fini(void) pr_err("%s: can't remove protocol\n", __func__); if (inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6)) pr_err("%s: can't remove protocol\n", __func__); + if (xfrm6_tunnel_mpls_supported() && + inet6_del_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS)) + pr_err("%s: can't remove protocol\n", __func__); } module_init(tunnel6_init); -- cgit v1.2.3 From f200e98d9716ce52464838dbaa2856e5ecc52194 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 20 May 2020 18:21:38 +0300 Subject: ip6_tunnel: add generic MPLS receive support Add support for MPLS in receive side. Signed-off-by: Vadim Fedorenko Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 6b94c870693c..821d96c720b9 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -89,6 +89,11 @@ struct ip6_tnl_net { struct ip6_tnl __rcu *collect_md_tun; }; +static inline int ip6_tnl_mpls_supported(void) +{ + return IS_ENABLED(CONFIG_MPLS); +} + static struct net_device_stats *ip6_get_stats(struct net_device *dev) { struct pcpu_sw_netstats tmp, sum = { 0 }; @@ -718,6 +723,20 @@ ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return 0; } +static int +mplsip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + __u32 rel_info = ntohl(info); + int err, rel_msg = 0; + u8 rel_type = type; + u8 rel_code = code; + + err = ip6_tnl_err(skb, IPPROTO_MPLS, opt, &rel_type, &rel_code, + &rel_msg, &rel_info, offset); + return err; +} + static int ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) @@ -740,6 +759,14 @@ static int ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, return IP6_ECN_decapsulate(ipv6h, skb); } +static inline int mplsip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, + const struct ipv6hdr *ipv6h, + struct sk_buff *skb) +{ + /* ECN is not supported in AF_MPLS */ + return 0; +} + __u32 ip6_tnl_get_cap(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) @@ -901,6 +928,11 @@ static const struct tnl_ptk_info tpi_v4 = { .proto = htons(ETH_P_IP), }; +static const struct tnl_ptk_info tpi_mpls = { + /* no tunnel info required for mplsip6. 
*/ + .proto = htons(ETH_P_MPLS_UC), +}; + static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, const struct tnl_ptk_info *tpi, int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, @@ -958,6 +990,12 @@ static int ip6ip6_rcv(struct sk_buff *skb) ip6ip6_dscp_ecn_decapsulate); } +static int mplsip6_rcv(struct sk_buff *skb) +{ + return ipxip6_rcv(skb, IPPROTO_MPLS, &tpi_mpls, + mplsip6_dscp_ecn_decapsulate); +} + struct ipv6_tel_txoption { struct ipv6_txoptions ops; __u8 dst_opt[8]; @@ -2198,6 +2236,12 @@ static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { .priority = 1, }; +static struct xfrm6_tunnel mplsip6_handler __read_mostly = { + .handler = mplsip6_rcv, + .err_handler = mplsip6_err, + .priority = 1, +}; + static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head *list) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); @@ -2312,6 +2356,15 @@ static int __init ip6_tunnel_init(void) pr_err("%s: can't register ip6ip6\n", __func__); goto out_ip6ip6; } + + if (ip6_tnl_mpls_supported()) { + err = xfrm6_tunnel_register(&mplsip6_handler, AF_MPLS); + if (err < 0) { + pr_err("%s: can't register mplsip6\n", __func__); + goto out_mplsip6; + } + } + err = rtnl_link_register(&ip6_link_ops); if (err < 0) goto rtnl_link_failed; @@ -2319,6 +2372,9 @@ static int __init ip6_tunnel_init(void) return 0; rtnl_link_failed: + if (ip6_tnl_mpls_supported()) + xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS); +out_mplsip6: xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6); out_ip6ip6: xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET); @@ -2341,6 +2397,9 @@ static void __exit ip6_tunnel_cleanup(void) if (xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6)) pr_info("%s: can't deregister ip6ip6\n", __func__); + if (ip6_tnl_mpls_supported() && + xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS)) + pr_info("%s: can't deregister mplsip6\n", __func__); unregister_pernet_device(&ip6_tnl_net_ops); } -- cgit v1.2.3 From 1515aa70c04151676a7dfefecfcf1d36ff52bc67 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 20 May 2020 18:21:39 +0300 Subject: mpls: Add support for IPv6 tunnels Add support for IPv6 tunnel devices in AF_MPLS. Signed-off-by: Vadim Fedorenko Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index a42e4ed5ab0e..fd30ea61336e 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1593,7 +1593,8 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, dev->type == ARPHRD_IPGRE || dev->type == ARPHRD_IP6GRE || dev->type == ARPHRD_SIT || - dev->type == ARPHRD_TUNNEL) { + dev->type == ARPHRD_TUNNEL || + dev->type == ARPHRD_TUNNEL6) { mdev = mpls_add_dev(dev); if (IS_ERR(mdev)) return notifier_from_errno(PTR_ERR(mdev)); -- cgit v1.2.3 From 060b6381efe58478e1d7dfff7a1e76a73a6377db Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Wed, 20 May 2020 19:18:10 +0100 Subject: net: flow_offload: simplify hw stats check handling Make FLOW_ACTION_HW_STATS_DONT_CARE be all bits, rather than none, so that drivers and __flow_action_hw_stats_check can use simple bitwise checks. Pre-fill all actions with DONT_CARE in flow_rule_alloc(), rather than relying on implicit semantics of zero from kzalloc, so that callers which don't configure action stats themselves (i.e. netfilter) get the correct behaviour by default. Only the kernel's internal API semantics change; the TC uAPI is unaffected. 
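As an illustration (not part of the patch), since DONT_CARE is now all bits set it satisfies any bitwise test, so a driver's capability check needs no special case for it. A minimal sketch of the resulting pattern, with example_check_hw_stats() as a hypothetical name (the mlxsw hunk below is the in-tree equivalent):

static int example_check_hw_stats(const struct flow_action_entry *act,
				  struct netlink_ext_ack *extack)
{
	if (act->hw_stats & FLOW_ACTION_HW_STATS_DISABLED)
		return 0;	/* nothing to count; DONT_CARE also lands here */

	if (act->hw_stats & FLOW_ACTION_HW_STATS_IMMEDIATE)
		return 1;	/* caller should insert an immediate counter */

	NL_SET_ERR_MSG_MOD(extack, "Unsupported action HW stats type");
	return -EOPNOTSUPP;
}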
v4: move DONT_CARE setting to flow_rule_alloc() for robustness and simplicity. v3: set DONT_CARE in nft and ct offload. v2: rebased on net-next, removed RFC tags. Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c | 8 ++++---- include/net/flow_offload.h | 11 +++++++---- net/core/flow_offload.c | 6 ++++++ 3 files changed, 17 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c index b286fe158820..51e1b3930c56 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c @@ -30,14 +30,14 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, return -EOPNOTSUPP; act = flow_action_first_entry_get(flow_action); - if (act->hw_stats == FLOW_ACTION_HW_STATS_ANY || - act->hw_stats == FLOW_ACTION_HW_STATS_IMMEDIATE) { + if (act->hw_stats & FLOW_ACTION_HW_STATS_DISABLED) { + /* Nothing to do */ + } else if (act->hw_stats & FLOW_ACTION_HW_STATS_IMMEDIATE) { /* Count action is inserted first */ err = mlxsw_sp_acl_rulei_act_count(mlxsw_sp, rulei, extack); if (err) return err; - } else if (act->hw_stats != FLOW_ACTION_HW_STATS_DISABLED && - act->hw_stats != FLOW_ACTION_HW_STATS_DONT_CARE) { + } else { NL_SET_ERR_MSG_MOD(extack, "Unsupported action HW stats type"); return -EOPNOTSUPP; } diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 4001ffb04f0d..95d633785ef9 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -168,10 +168,11 @@ enum flow_action_hw_stats_bit { FLOW_ACTION_HW_STATS_IMMEDIATE_BIT, FLOW_ACTION_HW_STATS_DELAYED_BIT, FLOW_ACTION_HW_STATS_DISABLED_BIT, + + FLOW_ACTION_HW_STATS_NUM_BITS }; enum flow_action_hw_stats { - FLOW_ACTION_HW_STATS_DONT_CARE = 0, FLOW_ACTION_HW_STATS_IMMEDIATE = BIT(FLOW_ACTION_HW_STATS_IMMEDIATE_BIT), FLOW_ACTION_HW_STATS_DELAYED = BIT(FLOW_ACTION_HW_STATS_DELAYED_BIT), @@ -179,6 +180,7 @@ enum flow_action_hw_stats { FLOW_ACTION_HW_STATS_DELAYED, FLOW_ACTION_HW_STATS_DISABLED = BIT(FLOW_ACTION_HW_STATS_DISABLED_BIT), + FLOW_ACTION_HW_STATS_DONT_CARE = BIT(FLOW_ACTION_HW_STATS_NUM_BITS) - 1, }; typedef void (*action_destr)(void *priv); @@ -340,11 +342,12 @@ __flow_action_hw_stats_check(const struct flow_action *action, return false; action_entry = flow_action_first_entry_get(action); - if (action_entry->hw_stats == FLOW_ACTION_HW_STATS_DONT_CARE) - return true; + + /* Zero is not a legal value for hw_stats, catch anyone passing it */ + WARN_ON_ONCE(!action_entry->hw_stats); if (!check_allow_bit && - action_entry->hw_stats != FLOW_ACTION_HW_STATS_ANY) { + ~action_entry->hw_stats & FLOW_ACTION_HW_STATS_ANY) { NL_SET_ERR_MSG_MOD(extack, "Driver supports only default HW stats type \"any\""); return false; } else if (check_allow_bit && diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index e951b743bed3..e64941c526b1 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -8,6 +8,7 @@ struct flow_rule *flow_rule_alloc(unsigned int num_actions) { struct flow_rule *rule; + int i; rule = kzalloc(struct_size(rule, action.entries, num_actions), GFP_KERNEL); @@ -15,6 +16,11 @@ struct flow_rule *flow_rule_alloc(unsigned int num_actions) return NULL; rule->action.num_entries = num_actions; + /* Pre-fill each action hw_stats with DONT_CARE. + * Caller can override this if it wants stats for a given action. 
+ */ + for (i = 0; i < num_actions; i++) + rule->action.entries[i].hw_stats = FLOW_ACTION_HW_STATS_DONT_CARE; return rule; } -- cgit v1.2.3 From 7aa38018be1fa4cff3e631f26bc821086ba90d29 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Thu, 21 May 2020 23:19:05 +0000 Subject: bridge: mrp: Add br_mrp_unique_ifindex function It is not allowed to have the same net bridge port be part of multiple MRP rings. Therefore add a check whether the port is already used in a different MRP. In that case return failure. Fixes: 9a9f26e8f7ea ("bridge: mrp: Connect MRP API with the switchdev API") Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/br_mrp.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'net') diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index d7bc09de4c13..854e31bf0151 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -37,6 +37,26 @@ static struct br_mrp *br_mrp_find_id(struct net_bridge *br, u32 ring_id) return res; } +static bool br_mrp_unique_ifindex(struct net_bridge *br, u32 ifindex) +{ + struct br_mrp *mrp; + + list_for_each_entry_rcu(mrp, &br->mrp_list, list, + lockdep_rtnl_is_held()) { + struct net_bridge_port *p; + + p = rtnl_dereference(mrp->p_port); + if (p && p->dev->ifindex == ifindex) + return false; + + p = rtnl_dereference(mrp->s_port); + if (p && p->dev->ifindex == ifindex) + return false; + } + + return true; +} + static struct br_mrp *br_mrp_find_port(struct net_bridge *br, struct net_bridge_port *p) { @@ -255,6 +275,11 @@ int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance) !br_mrp_get_port(br, instance->s_ifindex)) return -EINVAL; + /* It is not possible to have the same port part of multiple rings */ + if (!br_mrp_unique_ifindex(br, instance->p_ifindex) || + !br_mrp_unique_ifindex(br, instance->s_ifindex)) + return -EINVAL; + mrp = kzalloc(sizeof(*mrp), GFP_KERNEL); if (!mrp) return -ENOMEM; -- cgit v1.2.3 From 4fb13499d3a0cc74cf9820c052481f0ccda2bb23 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Thu, 21 May 2020 23:19:07 +0000 Subject: bridge: mrp: Restore port state when deleting MRP instance When an MRP instance is deleted, restore the port state according to the bridge state. If the bridge is up then the ports will be in the forwarding state, otherwise they will be in the disabled state. Fixes: 9a9f26e8f7ea ("bridge: mrp: Connect MRP API with the switchdev API") Acked-by: Nikolay Aleksandrov Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/br_mrp.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index 854e31bf0151..528d767eb026 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -223,6 +223,7 @@ out: static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp) { struct net_bridge_port *p; + u8 state; /* Stop sending MRP_Test frames */ cancel_delayed_work_sync(&mrp->test_work); @@ -234,20 +235,24 @@ static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp) p = rtnl_dereference(mrp->p_port); if (p) { spin_lock_bh(&br->lock); - p->state = BR_STATE_FORWARDING; + state = netif_running(br->dev) ?
+ BR_STATE_FORWARDING : BR_STATE_DISABLED; + p->state = state; p->flags &= ~BR_MRP_AWARE; spin_unlock_bh(&br->lock); - br_mrp_port_switchdev_set_state(p, BR_STATE_FORWARDING); + br_mrp_port_switchdev_set_state(p, state); rcu_assign_pointer(mrp->p_port, NULL); } p = rtnl_dereference(mrp->s_port); if (p) { spin_lock_bh(&br->lock); - p->state = BR_STATE_FORWARDING; + state = netif_running(br->dev) ? + BR_STATE_FORWARDING : BR_STATE_DISABLED; + p->state = state; p->flags &= ~BR_MRP_AWARE; spin_unlock_bh(&br->lock); - br_mrp_port_switchdev_set_state(p, BR_STATE_FORWARDING); + br_mrp_port_switchdev_set_state(p, state); rcu_assign_pointer(mrp->s_port, NULL); } -- cgit v1.2.3 From 07a7f30819475a6f058df6bba5150c50e7942cfb Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 22 May 2020 13:05:26 -0700 Subject: net: psample: fix build error when CONFIG_INET is not enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix psample build error when CONFIG_INET is not set/enabled by bracketing the tunnel code in #ifdef CONFIG_INET / #endif. ../net/psample/psample.c: In function ‘__psample_ip_tun_to_nlattr’: ../net/psample/psample.c:216:25: error: implicit declaration of function ‘ip_tunnel_info_opts’; did you mean ‘ip_tunnel_info_opts_set’? [-Werror=implicit-function-declaration] Signed-off-by: Randy Dunlap Cc: Yotam Gigi Cc: Cong Wang Signed-off-by: David S. Miller --- net/psample/psample.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/psample/psample.c b/net/psample/psample.c index 34a74043840b..a042261a45c5 100644 --- a/net/psample/psample.c +++ b/net/psample/psample.c @@ -209,6 +209,7 @@ void psample_group_put(struct psample_group *group) } EXPORT_SYMBOL_GPL(psample_group_put); +#ifdef CONFIG_INET static int __psample_ip_tun_to_nlattr(struct sk_buff *skb, struct ip_tunnel_info *tun_info) { @@ -352,12 +353,15 @@ static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info) return sum; } +#endif void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, u32 trunc_size, int in_ifindex, int out_ifindex, u32 sample_rate) { +#ifdef CONFIG_INET struct ip_tunnel_info *tun_info; +#endif struct sk_buff *nl_skb; int data_len; int meta_len; @@ -371,9 +375,11 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, nla_total_size(sizeof(u32)) + /* group_num */ nla_total_size(sizeof(u32)); /* seq */ +#ifdef CONFIG_INET tun_info = skb_tunnel_info(skb); if (tun_info) meta_len += psample_tunnel_meta_len(tun_info); +#endif data_len = min(skb->len, trunc_size); if (meta_len + nla_total_size(data_len) > PSAMPLE_MAX_PACKET_SIZE) @@ -429,11 +435,13 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, goto error; } +#ifdef CONFIG_INET if (tun_info) { ret = psample_ip_tun_to_nlattr(nl_skb, tun_info); if (unlikely(ret < 0)) goto error; } +#endif genlmsg_end(nl_skb, data); genlmsg_multicast_netns(&psample_nl_family, group->net, nl_skb, 0, -- cgit v1.2.3 From cb8a14b205699fee1053a406e1e3fce330b6bdc3 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Sat, 23 May 2020 15:27:08 +0200 Subject: net: move devres helpers into a separate source file There's currently only a single devres helper in net/ - devm variant of alloc_etherdev. Let's move it to net/devres.c with the intention of adding a second one: devm_register_netdev().
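As a hedged illustration of where this is heading (a sketch with invented mydrv_* names, not code from this series), a driver probe() would then be able to lean on devres for the whole netdev lifetime:

#include <linux/etherdevice.h>
#include <linux/platform_device.h>

struct mydrv_priv { int dummy; }; /* invented private state */

static int mydrv_probe(struct platform_device *pdev)
{
        struct net_device *ndev;

        /* the netdev is freed automatically when pdev->dev detaches */
        ndev = devm_alloc_etherdev(&pdev->dev, sizeof(struct mydrv_priv));
        if (!ndev)
                return -ENOMEM;

        SET_NETDEV_DEV(ndev, &pdev->dev);
        /* ... set up netdev_ops, MAC address, etc. ... */

        /* devres releases run in reverse order, so the automatic
         * unregister happens before the automatic free above
         */
        return devm_register_netdev(&pdev->dev, ndev);
}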
This new routine will need to know the address of the release function of devm_alloc_etherdev() so that it can verify (using devres_find()) that the struct net_device that's being passed to it is also resource managed. Signed-off-by: Bartosz Golaszewski Signed-off-by: David S. Miller --- net/Makefile | 2 +- net/devres.c | 36 ++++++++++++++++++++++++++++++++++++ net/ethernet/eth.c | 28 ---------------------------- 3 files changed, 37 insertions(+), 29 deletions(-) create mode 100644 net/devres.c (limited to 'net') diff --git a/net/Makefile b/net/Makefile index 07ea48160874..5744bf1997fd 100644 --- a/net/Makefile +++ b/net/Makefile @@ -6,7 +6,7 @@ # Rewritten to use lists instead of if-statements. # -obj-$(CONFIG_NET) := socket.o core/ +obj-$(CONFIG_NET) := devres.o socket.o core/ tmp-$(CONFIG_COMPAT) := compat.o obj-$(CONFIG_NET) += $(tmp-y) diff --git a/net/devres.c b/net/devres.c new file mode 100644 index 000000000000..c1465d9f9019 --- /dev/null +++ b/net/devres.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * This file contains all networking devres helpers. + */ + +#include <linux/device.h> +#include <linux/etherdevice.h> +#include <linux/netdevice.h> + +static void devm_free_netdev(struct device *dev, void *res) +{ + free_netdev(*(struct net_device **)res); +} + +struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv, + unsigned int txqs, unsigned int rxqs) +{ + struct net_device **dr; + struct net_device *netdev; + + dr = devres_alloc(devm_free_netdev, sizeof(*dr), GFP_KERNEL); + if (!dr) + return NULL; + + netdev = alloc_etherdev_mqs(sizeof_priv, txqs, rxqs); + if (!netdev) { + devres_free(dr); + return NULL; + } + + *dr = netdev; + devres_add(dev, dr); + + return netdev; +} +EXPORT_SYMBOL(devm_alloc_etherdev_mqs); diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index c8b903302ff2..dac65180c4ef 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -400,34 +400,6 @@ struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, } EXPORT_SYMBOL(alloc_etherdev_mqs); -static void devm_free_netdev(struct device *dev, void *res) -{ - free_netdev(*(struct net_device **)res); -} - -struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv, - unsigned int txqs, unsigned int rxqs) -{ - struct net_device **dr; - struct net_device *netdev; - - dr = devres_alloc(devm_free_netdev, sizeof(*dr), GFP_KERNEL); - if (!dr) - return NULL; - - netdev = alloc_etherdev_mqs(sizeof_priv, txqs, rxqs); - if (!netdev) { - devres_free(dr); - return NULL; - } - - *dr = netdev; - devres_add(dev, dr); - - return netdev; -} -EXPORT_SYMBOL(devm_alloc_etherdev_mqs); ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len) { return scnprintf(buf, PAGE_SIZE, "%*phC\n", len, addr); -- cgit v1.2.3 From f75063abc39441585a13fcc5b9ef3af993e9ac40 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Sat, 23 May 2020 15:27:09 +0200 Subject: net: devres: define a separate devres structure for devm_alloc_etherdev() Not using a proxy structure to store struct net_device doesn't save anything in terms of compiled code size or memory usage but significantly decreases the readability of the code with all the pointer casting. Define struct net_device_devres and use it in devm_alloc_etherdev_mqs(). Signed-off-by: Bartosz Golaszewski Signed-off-by: David S.
Miller --- net/devres.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/devres.c b/net/devres.c index c1465d9f9019..b97b0c5a8216 100644 --- a/net/devres.c +++ b/net/devres.c @@ -7,30 +7,34 @@ #include #include -static void devm_free_netdev(struct device *dev, void *res) +struct net_device_devres { + struct net_device *ndev; +}; + +static void devm_free_netdev(struct device *dev, void *this) { - free_netdev(*(struct net_device **)res); + struct net_device_devres *res = this; + + free_netdev(res->ndev); } struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv, unsigned int txqs, unsigned int rxqs) { - struct net_device **dr; - struct net_device *netdev; + struct net_device_devres *dr; dr = devres_alloc(devm_free_netdev, sizeof(*dr), GFP_KERNEL); if (!dr) return NULL; - netdev = alloc_etherdev_mqs(sizeof_priv, txqs, rxqs); - if (!netdev) { + dr->ndev = alloc_etherdev_mqs(sizeof_priv, txqs, rxqs); + if (!dr->ndev) { devres_free(dr); return NULL; } - *dr = netdev; devres_add(dev, dr); - return netdev; + return dr->ndev; } EXPORT_SYMBOL(devm_alloc_etherdev_mqs); -- cgit v1.2.3 From cd16627fc0468564fdd60f20ad52420b87195127 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Sat, 23 May 2020 15:27:10 +0200 Subject: net: devres: provide devm_register_netdev() Provide devm_register_netdev() - a device resource managed variant of register_netdev(). This new helper will only work for net_device structs that are also already managed by devres. Signed-off-by: Bartosz Golaszewski Signed-off-by: David S. Miller --- Documentation/driver-api/driver-model/devres.rst | 1 + include/linux/netdevice.h | 2 + net/devres.c | 55 ++++++++++++++++++++++++ 3 files changed, 58 insertions(+) (limited to 'net') diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst index 50df28d20fa7..fc242ed4bde5 100644 --- a/Documentation/driver-api/driver-model/devres.rst +++ b/Documentation/driver-api/driver-model/devres.rst @@ -375,6 +375,7 @@ MUX NET devm_alloc_etherdev() devm_alloc_etherdev_mqs() + devm_register_netdev() PER-CPU MEM devm_alloc_percpu() diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a18f8fdf4260..1a96e9c4ec36 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4280,6 +4280,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, int register_netdev(struct net_device *dev); void unregister_netdev(struct net_device *dev); +int devm_register_netdev(struct device *dev, struct net_device *ndev); + /* General hardware address lists handling functions */ int __hw_addr_sync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len); diff --git a/net/devres.c b/net/devres.c index b97b0c5a8216..57a6a88d11f6 100644 --- a/net/devres.c +++ b/net/devres.c @@ -38,3 +38,58 @@ struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv, return dr->ndev; } EXPORT_SYMBOL(devm_alloc_etherdev_mqs); + +static void devm_netdev_release(struct device *dev, void *this) +{ + struct net_device_devres *res = this; + + unregister_netdev(res->ndev); +} + +static int netdev_devres_match(struct device *dev, void *this, void *match_data) +{ + struct net_device_devres *res = this; + struct net_device *ndev = match_data; + + return ndev == res->ndev; +} + +/** + * devm_register_netdev - resource managed variant of register_netdev() + * @dev: managing device for this netdev - usually the 
parent device + * @ndev: device to register + * + * This is a devres variant of register_netdev() for which the unregister + * function will be call automatically when the managing device is + * detached. Note: the net_device used must also be resource managed by + * the same struct device. + */ +int devm_register_netdev(struct device *dev, struct net_device *ndev) +{ + struct net_device_devres *dr; + int ret; + + /* struct net_device must itself be managed. For now a managed netdev + * can only be allocated by devm_alloc_etherdev_mqs() so the check is + * straightforward. + */ + if (WARN_ON(!devres_find(dev, devm_free_netdev, + netdev_devres_match, ndev))) + return -EINVAL; + + dr = devres_alloc(devm_netdev_release, sizeof(*dr), GFP_KERNEL); + if (!dr) + return -ENOMEM; + + ret = register_netdev(ndev); + if (ret) { + devres_free(dr); + return ret; + } + + dr->ndev = ndev; + devres_add(ndev->dev.parent, dr); + + return 0; +} +EXPORT_SYMBOL(devm_register_netdev); -- cgit v1.2.3 From 316107119f473e764cf5e50437333c8b83bec0da Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 23 May 2020 17:40:25 +0200 Subject: ethtool: propagate get_coalesce return value get_coalesce returns 0 or ERRNO, but the return value isn't checked. The returned coalesce data may be invalid if an ERRNO is set, therefore better check and propagate the return value. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- net/ethtool/ioctl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 31e0b4e88a9d..b5df90c981c2 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1510,11 +1510,14 @@ static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr) { struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; + int ret; if (!dev->ethtool_ops->get_coalesce) return -EOPNOTSUPP; - dev->ethtool_ops->get_coalesce(dev, &coalesce); + ret = dev->ethtool_ops->get_coalesce(dev, &coalesce); + if (ret) + return ret; if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) return -EFAULT; -- cgit v1.2.3 From 6a1015b0b4b1f3a0de9e40d2ba86877d13f50918 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 23 May 2020 20:46:48 +0300 Subject: ipv4: potential underflow in compat_ip_setsockopt() The value of "n" is capped at 0x1ffffff but it checked for negative values. I don't think this causes a problem but I'm not certain and it's harmless to prevent it. Fixes: 2e04172875c9 ("ipv4: do compat setsockopt for MCAST_MSFILTER directly") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index a2469bc57cfe..f43d5f12aa86 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1347,8 +1347,8 @@ int compat_ip_setsockopt(struct sock *sk, int level, int optname, { const int size0 = offsetof(struct compat_group_filter, gf_slist); struct compat_group_filter *gf32; + unsigned int n; void *p; - int n; if (optlen < size0) return -EINVAL; -- cgit v1.2.3 From 45af29ca761c275e350cca659856bc56f1035ef9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 24 May 2020 11:00:02 -0700 Subject: tcp: allow traceroute -Mtcp for unpriv users MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unpriv users can use traceroute over plain UDP sockets, but not TCP ones. 
$ traceroute -Mtcp 8.8.8.8 You do not have enough privileges to use this traceroute method. $ traceroute -n -Mudp 8.8.8.8 traceroute to 8.8.8.8 (8.8.8.8), 30 hops max, 60 byte packets 1 192.168.86.1 3.631 ms 3.512 ms 3.405 ms 2 10.1.10.1 4.183 ms 4.125 ms 4.072 ms 3 96.120.88.125 20.621 ms 19.462 ms 20.553 ms 4 96.110.177.65 24.271 ms 25.351 ms 25.250 ms 5 69.139.199.197 44.492 ms 43.075 ms 44.346 ms 6 68.86.143.93 27.969 ms 25.184 ms 25.092 ms 7 96.112.146.18 25.323 ms 96.112.146.22 25.583 ms 96.112.146.26 24.502 ms 8 72.14.239.204 24.405 ms 74.125.37.224 16.326 ms 17.194 ms 9 209.85.251.9 18.154 ms 209.85.247.55 14.449 ms 209.85.251.9 26.296 ms^C We can easily support traceroute over TCP, by queueing an error message into socket error queue. Note that applications need to set IP_RECVERR/IPV6_RECVERR option to enable this feature, and that the error message is only queued while in SYN_SNT state. socket(AF_INET6, SOCK_STREAM, IPPROTO_IP) = 3 setsockopt(3, SOL_IPV6, IPV6_RECVERR, [1], 4) = 0 setsockopt(3, SOL_SOCKET, SO_TIMESTAMP_OLD, [1], 4) = 0 setsockopt(3, SOL_IPV6, IPV6_UNICAST_HOPS, [5], 4) = 0 connect(3, {sa_family=AF_INET6, sin6_port=htons(8787), sin6_flowinfo=htonl(0), inet_pton(AF_INET6, "2002:a05:6608:297::", &sin6_addr), sin6_scope_id=0}, 28) = -1 EHOSTUNREACH (No route to host) recvmsg(3, {msg_name={sa_family=AF_INET6, sin6_port=htons(8787), sin6_flowinfo=htonl(0), inet_pton(AF_INET6, "2002:a05:6608:297::", &sin6_addr), sin6_scope_id=0}, msg_namelen=1024->28, msg_iov=[{iov_base="`\r\337\320\0004\6\1&\7\370\260\200\231\16\27\0\0\0\0\0\0\0\0 \2\n\5f\10\2\227"..., iov_len=1024}], msg_iovlen=1, msg_control=[{cmsg_len=32, cmsg_level=SOL_SOCKET, cmsg_type=SO_TIMESTAMP_OLD, cmsg_data={tv_sec=1590340680, tv_usec=272424}}, {cmsg_len=60, cmsg_level=SOL_IPV6, cmsg_type=IPV6_RECVERR}], msg_controllen=96, msg_flags=MSG_ERRQUEUE}, MSG_ERRQUEUE) = 144 Suggested-by: Maciej Żenczykowski Cc: Willem de Bruijn Reviewed-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 ++ net/ipv6/tcp_ipv6.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6c05f1ceb538..900c6d154cbc 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -573,6 +573,8 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (fastopen && !fastopen->sk) break; + ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); + if (!sock_owned_by_user(sk)) { sk->sk_err = err; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 413b3425ac66..01a6f5111a77 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -463,6 +463,8 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (fastopen && !fastopen->sk) break; + ipv6_icmp_error(sk, skb, err, th->dest, ntohl(info), (u8 *)th); + if (!sock_owned_by_user(sk)) { sk->sk_err = err; sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ -- cgit v1.2.3 From 617504c67e01d30310558442777a4112ea6d587d Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Mon, 25 May 2020 09:55:41 +0000 Subject: bridge: mrp: Fix out-of-bounds read in br_mrp_parse The issue was reported by syzbot. When the function br_mrp_parse was called with a valid net_bridge_port, the net_bridge was an invalid pointer. Therefore the check br->stp_enabled could pass/fail depending where it was pointing in memory. The fix consists of setting the net_bridge pointer if the port is a valid pointer. 
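Reduced to a standalone sketch (structs pared down to the relevant members, not the kernel definitions), the shape of the fix is:

struct net_bridge { int stp_enabled; };
struct net_bridge_port { struct net_bridge *br; };

/* In the per-port call path only "p" is trustworthy, so "br" must be
 * re-derived from the port before it is dereferenced.
 */
static int parse(struct net_bridge *br, struct net_bridge_port *p)
{
        if (p)
                br = p->br;

        return br->stp_enabled; /* br is now guaranteed valid */
}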
Reported-by: syzbot+9c6f0f1f8e32223df9a4@syzkaller.appspotmail.com Fixes: 6536993371fa ("bridge: mrp: Integrate MRP into the bridge") Signed-off-by: Horatiu Vultur Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_mrp_netlink.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c index 397e7f710772..4a08a99519b0 100644 --- a/net/bridge/br_mrp_netlink.c +++ b/net/bridge/br_mrp_netlink.c @@ -27,6 +27,12 @@ int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *tb[IFLA_BRIDGE_MRP_MAX + 1]; int err; + /* When this function is called for a port then the br pointer is + * invalid, therefor set the br to point correctly + */ + if (p) + br = p->br; + if (br->stp_enabled != BR_NO_STP) { NL_SET_ERR_MSG_MOD(extack, "MRP can't be enabled if STP is already enabled"); return -EINVAL; -- cgit v1.2.3 From 9ad346c90509ebd983f60da7d082f261ad329507 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 25 Nov 2019 10:46:50 +0100 Subject: batman-adv: Revert "disable ethtool link speed detection when auto negotiation off" The commit 8c46fcd78308 ("batman-adv: disable ethtool link speed detection when auto negotiation off") disabled the usage of ethtool's link_ksetting when auto negotation was enabled due to invalid values when used with tun/tap virtual net_devices. According to the patch, automatic measurements should be used for these kind of interfaces. But there are major flaws with this argumentation: * automatic measurements are not implemented * auto negotiation has nothing to do with the validity of the retrieved values The first point has to be fixed by a longer patch series. The "validity" part of the second point must be addressed in the same patch series by dropping the usage of ethtool's link_ksetting (thus always doing automatic measurements over ethernet). Drop the patch again to have more default values for various net_device types/configurations. The user can still overwrite them using the batadv_hardif's BATADV_ATTR_THROUGHPUT_OVERRIDE. Reported-by: Matthias Schiffer Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v_elp.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 353e49c40e7f..0bdefa35da98 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -127,20 +127,7 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) rtnl_lock(); ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings); rtnl_unlock(); - - /* Virtual interface drivers such as tun / tap interfaces, VLAN, etc - * tend to initialize the interface throughput with some value for the - * sake of having a throughput number to export via ethtool. This - * exported throughput leaves batman-adv to conclude the interface - * throughput is genuine (reflecting reality), thus no measurements - * are necessary. - * - * Based on the observation that those interface types also tend to set - * the link auto-negotiation to 'off', batman-adv shall check this - * setting to differentiate between genuine link throughput information - * and placeholders installed by virtual interfaces. 
- */ - if (ret == 0 && link_settings.base.autoneg == AUTONEG_ENABLE) { + if (ret == 0) { /* link characteristics might change over time */ if (link_settings.base.duplex == DUPLEX_FULL) hard_iface->bat_v.flags |= BATADV_FULL_DUPLEX; -- cgit v1.2.3 From 239174945dac8cb9613db7755103d5fb6c32241d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 25 May 2020 20:15:24 -0700 Subject: tcp: tcp_v4_err() icmp skb is named icmp_skb I missed the fact that tcp_v4_err() differs from tcp_v6_err(). After commit 4d1a2d9ec1c1 ("Rename skb to icmp_skb in tcp_v4_err()") the skb argument has been renamed to icmp_skb only in one function. I will in a future patch reconcile these functions to avoid this kind of confusion. Fixes: 45af29ca761c ("tcp: allow traceroute -Mtcp for unpriv users") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 900c6d154cbc..6789671f0f5a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -573,7 +573,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (fastopen && !fastopen->sk) break; - ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); + ip_icmp_error(sk, icmp_skb, err, th->dest, info, (u8 *)th); if (!sock_owned_by_user(sk)) { sk->sk_err = err; -- cgit v1.2.3 From d7626b5acff9227e2a65da636a53e09bdafdc0aa Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Tue, 26 May 2020 16:38:34 +0700 Subject: tipc: introduce Gap ACK blocks for broadcast link As achieved through commit 9195948fbf34 ("tipc: improve TIPC throughput by Gap ACK blocks"), we apply the same mechanism for the broadcast link as well. The 'Gap ACK blocks' data field in a 'PROTOCOL/STATE_MSG' will consist of two parts built for both the broadcast and unicast types: 31 16 15 0 +-------------+-------------+-------------+-------------+ | bgack_cnt | ugack_cnt | len | +-------------+-------------+-------------+-------------+ - | gap | ack | | +-------------+-------------+-------------+-------------+ > bc gacks : : : | +-------------+-------------+-------------+-------------+ - | gap | ack | | +-------------+-------------+-------------+-------------+ > uc gacks : : : | +-------------+-------------+-------------+-------------+ - which is "automatically" backward-compatible. We also increase the max number of Gap ACK blocks to 128, allowing up to 64 blocks per type (total buffer size = 516 bytes). Besides, the 'tipc_link_advance_transmq()' function is refactored so that it is now applicable to both the unicast and broadcast cases; some old functions can thus be removed and the code is optimized. With the patch, TIPC broadcast is more robust regardless of packet loss or disorder, latency, ... in the underlying network. Its performance is boosted significantly. For example, an experiment with a 5% packet loss rate gives these results: $ time tipc-pipe --mc --rdm --data_size 123 --data_num 1500000 real 0m 42.46s user 0m 1.16s sys 0m 17.67s Without the patch: $ time tipc-pipe --mc --rdm --data_size 123 --data_num 1500000 real 8m 27.94s user 0m 0.55s sys 0m 2.38s Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S.
Miller --- net/tipc/bcast.c | 9 +- net/tipc/link.c | 425 ++++++++++++++++++++++++++++++++----------------------- net/tipc/link.h | 7 +- net/tipc/msg.h | 27 +++- net/tipc/node.c | 10 +- 5 files changed, 293 insertions(+), 185 deletions(-) (limited to 'net') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 4c20be08b9c4..3ce690a96ee9 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -474,7 +474,7 @@ void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, __skb_queue_head_init(&xmitq); tipc_bcast_lock(net); - tipc_link_bc_ack_rcv(l, acked, &xmitq); + tipc_link_bc_ack_rcv(l, acked, 0, NULL, &xmitq); tipc_bcast_unlock(net); tipc_bcbase_xmit(net, &xmitq); @@ -492,6 +492,7 @@ int tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l, struct tipc_msg *hdr) { struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq; + struct tipc_gap_ack_blks *ga; struct sk_buff_head xmitq; int rc = 0; @@ -501,8 +502,10 @@ int tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l, if (msg_type(hdr) != STATE_MSG) { tipc_link_bc_init_rcv(l, hdr); } else if (!msg_bc_ack_invalid(hdr)) { - tipc_link_bc_ack_rcv(l, msg_bcast_ack(hdr), &xmitq); - rc = tipc_link_bc_sync_rcv(l, hdr, &xmitq); + tipc_get_gap_ack_blks(&ga, l, hdr, false); + rc = tipc_link_bc_ack_rcv(l, msg_bcast_ack(hdr), + msg_bc_gap(hdr), ga, &xmitq); + rc |= tipc_link_bc_sync_rcv(l, hdr, &xmitq); } tipc_bcast_unlock(net); diff --git a/net/tipc/link.c b/net/tipc/link.c index d4675e922a8f..d29b9c531171 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -188,6 +188,8 @@ struct tipc_link { /* Broadcast */ u16 ackers; u16 acked; + u16 last_gap; + struct tipc_gap_ack_blks *last_ga; struct tipc_link *bc_rcvlink; struct tipc_link *bc_sndlink; u8 nack_state; @@ -249,11 +251,14 @@ static int tipc_link_build_nack_msg(struct tipc_link *l, struct sk_buff_head *xmitq); static void tipc_link_build_bc_init_msg(struct tipc_link *l, struct sk_buff_head *xmitq); -static int tipc_link_release_pkts(struct tipc_link *l, u16 to); -static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data, u16 gap); -static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, +static u8 __tipc_build_gap_ack_blks(struct tipc_gap_ack_blks *ga, + struct tipc_link *l, u8 start_index); +static u16 tipc_build_gap_ack_blks(struct tipc_link *l, struct tipc_msg *hdr); +static int tipc_link_advance_transmq(struct tipc_link *l, struct tipc_link *r, + u16 acked, u16 gap, struct tipc_gap_ack_blks *ga, - struct sk_buff_head *xmitq); + struct sk_buff_head *xmitq, + bool *retransmitted, int *rc); static void tipc_link_update_cwin(struct tipc_link *l, int released, bool retransmitted); /* @@ -370,7 +375,7 @@ void tipc_link_remove_bc_peer(struct tipc_link *snd_l, snd_l->ackers--; rcv_l->bc_peer_is_up = true; rcv_l->state = LINK_ESTABLISHED; - tipc_link_bc_ack_rcv(rcv_l, ack, xmitq); + tipc_link_bc_ack_rcv(rcv_l, ack, 0, NULL, xmitq); trace_tipc_link_reset(rcv_l, TIPC_DUMP_ALL, "bclink removed!"); tipc_link_reset(rcv_l); rcv_l->state = LINK_RESET; @@ -784,8 +789,6 @@ bool tipc_link_too_silent(struct tipc_link *l) return (l->silent_intv_cnt + 2 > l->abort_limit); } -static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, - u16 from, u16 to, struct sk_buff_head *xmitq); /* tipc_link_timeout - perform periodic task as instructed from node timeout */ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) @@ -948,6 +951,9 @@ void tipc_link_reset(struct tipc_link *l) l->snd_nxt_state = 1; l->rcv_nxt_state = 1; l->acked = 0; + l->last_gap 
= 0; + kfree(l->last_ga); + l->last_ga = NULL; l->silent_intv_cnt = 0; l->rst_cnt = 0; l->bc_peer_is_up = false; @@ -1183,68 +1189,14 @@ static bool link_retransmit_failure(struct tipc_link *l, struct tipc_link *r, if (link_is_bc_sndlink(l)) { r->state = LINK_RESET; - *rc = TIPC_LINK_DOWN_EVT; + *rc |= TIPC_LINK_DOWN_EVT; } else { - *rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT); + *rc |= tipc_link_fsm_evt(l, LINK_FAILURE_EVT); } return true; } -/* tipc_link_bc_retrans() - retransmit zero or more packets - * @l: the link to transmit on - * @r: the receiving link ordering the retransmit. Same as l if unicast - * @from: retransmit from (inclusive) this sequence number - * @to: retransmit to (inclusive) this sequence number - * xmitq: queue for accumulating the retransmitted packets - */ -static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, - u16 from, u16 to, struct sk_buff_head *xmitq) -{ - struct sk_buff *_skb, *skb = skb_peek(&l->transmq); - u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; - u16 ack = l->rcv_nxt - 1; - int retransmitted = 0; - struct tipc_msg *hdr; - int rc = 0; - - if (!skb) - return 0; - if (less(to, from)) - return 0; - - trace_tipc_link_retrans(r, from, to, &l->transmq); - - if (link_retransmit_failure(l, r, &rc)) - return rc; - - skb_queue_walk(&l->transmq, skb) { - hdr = buf_msg(skb); - if (less(msg_seqno(hdr), from)) - continue; - if (more(msg_seqno(hdr), to)) - break; - if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) - continue; - TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM; - _skb = pskb_copy(skb, GFP_ATOMIC); - if (!_skb) - return 0; - hdr = buf_msg(_skb); - msg_set_ack(hdr, ack); - msg_set_bcast_ack(hdr, bc_ack); - _skb->priority = TC_PRIO_CONTROL; - __skb_queue_tail(xmitq, _skb); - l->stats.retransmitted++; - retransmitted++; - /* Increase actual retrans counter & mark first time */ - if (!TIPC_SKB_CB(skb)->retr_cnt++) - TIPC_SKB_CB(skb)->retr_stamp = jiffies; - } - tipc_link_update_cwin(l, 0, retransmitted); - return 0; -} - /* tipc_data_input - deliver data and name distr msgs to upper layer * * Consumes buffer if message is of right type @@ -1402,46 +1354,71 @@ static int tipc_link_tnl_rcv(struct tipc_link *l, struct sk_buff *skb, return rc; } -static int tipc_link_release_pkts(struct tipc_link *l, u16 acked) -{ - int released = 0; - struct sk_buff *skb, *tmp; - - skb_queue_walk_safe(&l->transmq, skb, tmp) { - if (more(buf_seqno(skb), acked)) - break; - __skb_unlink(skb, &l->transmq); - kfree_skb(skb); - released++; +/** + * tipc_get_gap_ack_blks - get Gap ACK blocks from PROTOCOL/STATE_MSG + * @ga: returned pointer to the Gap ACK blocks if any + * @l: the tipc link + * @hdr: the PROTOCOL/STATE_MSG header + * @uc: desired Gap ACK blocks type, i.e. unicast (= 1) or broadcast (= 0) + * + * Return: the total Gap ACK blocks size + */ +u16 tipc_get_gap_ack_blks(struct tipc_gap_ack_blks **ga, struct tipc_link *l, + struct tipc_msg *hdr, bool uc) +{ + struct tipc_gap_ack_blks *p; + u16 sz = 0; + + /* Does peer support the Gap ACK blocks feature? */ + if (l->peer_caps & TIPC_GAP_ACK_BLOCK) { + p = (struct tipc_gap_ack_blks *)msg_data(hdr); + sz = ntohs(p->len); + /* Sanity check */ + if (sz == tipc_gap_ack_blks_sz(p->ugack_cnt + p->bgack_cnt)) { + /* Good, check if the desired type exists */ + if ((uc && p->ugack_cnt) || (!uc && p->bgack_cnt)) + goto ok; + /* Backward compatible: peer might not support bc, but uc? 
*/ + } else if (uc && sz == tipc_gap_ack_blks_sz(p->ugack_cnt)) { + if (p->ugack_cnt) { + p->bgack_cnt = 0; + goto ok; + } + } } - return released; + /* Other cases: ignore! */ + p = NULL; + +ok: + *ga = p; + return sz; } -/* tipc_build_gap_ack_blks - build Gap ACK blocks - * @l: tipc link that data have come with gaps in sequence if any - * @data: data buffer to store the Gap ACK blocks after built - * - * returns the actual allocated memory size - */ -static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data, u16 gap) +static u8 __tipc_build_gap_ack_blks(struct tipc_gap_ack_blks *ga, + struct tipc_link *l, u8 start_index) { + struct tipc_gap_ack *gacks = &ga->gacks[start_index]; struct sk_buff *skb = skb_peek(&l->deferdq); - struct tipc_gap_ack_blks *ga = data; - u16 len, expect, seqno = 0; + u16 expect, seqno = 0; u8 n = 0; - if (!skb || !gap) - goto exit; + if (!skb) + return 0; expect = buf_seqno(skb); skb_queue_walk(&l->deferdq, skb) { seqno = buf_seqno(skb); if (unlikely(more(seqno, expect))) { - ga->gacks[n].ack = htons(expect - 1); - ga->gacks[n].gap = htons(seqno - expect); - if (++n >= MAX_GAP_ACK_BLKS) { - pr_info_ratelimited("Too few Gap ACK blocks!\n"); - goto exit; + gacks[n].ack = htons(expect - 1); + gacks[n].gap = htons(seqno - expect); + if (++n >= MAX_GAP_ACK_BLKS / 2) { + char buf[TIPC_MAX_LINK_NAME]; + + pr_info_ratelimited("Gacks on %s: %d, ql: %d!\n", + tipc_link_name_ext(l, buf), + n, + skb_queue_len(&l->deferdq)); + return n; } } else if (unlikely(less(seqno, expect))) { pr_warn("Unexpected skb in deferdq!\n"); @@ -1451,14 +1428,44 @@ static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data, u16 gap) } /* last block */ - ga->gacks[n].ack = htons(seqno); - ga->gacks[n].gap = 0; + gacks[n].ack = htons(seqno); + gacks[n].gap = 0; n++; + return n; +} -exit: - len = tipc_gap_ack_blks_sz(n); +/* tipc_build_gap_ack_blks - build Gap ACK blocks + * @l: tipc unicast link + * @hdr: the tipc message buffer to store the Gap ACK blocks after built + * + * The function builds Gap ACK blocks for both the unicast & broadcast receiver + * links of a certain peer, the buffer after built has the network data format + * as found at the struct tipc_gap_ack_blks definition. + * + * returns the actual allocated memory size + */ +static u16 tipc_build_gap_ack_blks(struct tipc_link *l, struct tipc_msg *hdr) +{ + struct tipc_link *bcl = l->bc_rcvlink; + struct tipc_gap_ack_blks *ga; + u16 len; + + ga = (struct tipc_gap_ack_blks *)msg_data(hdr); + + /* Start with broadcast link first */ + tipc_bcast_lock(bcl->net); + msg_set_bcast_ack(hdr, bcl->rcv_nxt - 1); + msg_set_bc_gap(hdr, link_bc_rcv_gap(bcl)); + ga->bgack_cnt = __tipc_build_gap_ack_blks(ga, bcl, 0); + tipc_bcast_unlock(bcl->net); + + /* Now for unicast link, but an explicit NACK only (???) */ + ga->ugack_cnt = (msg_seq_gap(hdr)) ? + __tipc_build_gap_ack_blks(ga, l, ga->bgack_cnt) : 0; + + /* Total len */ + len = tipc_gap_ack_blks_sz(ga->bgack_cnt + ga->ugack_cnt); ga->len = htons(len); - ga->gack_cnt = n; return len; } @@ -1466,47 +1473,109 @@ exit: * acked packets, also doing retransmissions if * gaps found * @l: tipc link with transmq queue to be advanced + * @r: tipc link "receiver" i.e. 
in case of broadcast (= "l" if unicast) * @acked: seqno of last packet acked by peer without any gaps before * @gap: # of gap packets * @ga: buffer pointer to Gap ACK blocks from peer * @xmitq: queue for accumulating the retransmitted packets if any + * @retransmitted: returned boolean value if a retransmission is really issued + * @rc: returned code e.g. TIPC_LINK_DOWN_EVT if a repeated retransmit failures + * happens (- unlikely case) * - * In case of a repeated retransmit failures, the call will return shortly - * with a returned code (e.g. TIPC_LINK_DOWN_EVT) + * Return: the number of packets released from the link transmq */ -static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, +static int tipc_link_advance_transmq(struct tipc_link *l, struct tipc_link *r, + u16 acked, u16 gap, struct tipc_gap_ack_blks *ga, - struct sk_buff_head *xmitq) + struct sk_buff_head *xmitq, + bool *retransmitted, int *rc) { + struct tipc_gap_ack_blks *last_ga = r->last_ga, *this_ga = NULL; + struct tipc_gap_ack *gacks = NULL; struct sk_buff *skb, *_skb, *tmp; struct tipc_msg *hdr; + u32 qlen = skb_queue_len(&l->transmq); + u16 nacked = acked, ngap = gap, gack_cnt = 0; u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; - bool retransmitted = false; u16 ack = l->rcv_nxt - 1; - bool passed = false; - u16 released = 0; u16 seqno, n = 0; - int rc = 0; + u16 end = r->acked, start = end, offset = r->last_gap; + u16 si = (last_ga) ? last_ga->start_index : 0; + bool is_uc = !link_is_bc_sndlink(l); + bool bc_has_acked = false; + + /* Determine Gap ACK blocks if any for the particular link */ + if (ga && is_uc) { + /* Get the Gap ACKs, uc part */ + gack_cnt = ga->ugack_cnt; + gacks = &ga->gacks[ga->bgack_cnt]; + } else if (ga) { + /* Copy the Gap ACKs, bc part, for later renewal if needed */ + this_ga = kmemdup(ga, tipc_gap_ack_blks_sz(ga->bgack_cnt), + GFP_ATOMIC); + if (likely(this_ga)) { + this_ga->start_index = 0; + /* Start with the bc Gap ACKs */ + gack_cnt = this_ga->bgack_cnt; + gacks = &this_ga->gacks[0]; + } else { + /* Hmm, we can get in trouble..., simply ignore it */ + pr_warn_ratelimited("Ignoring bc Gap ACKs, no memory\n"); + } + } + /* Advance the link transmq */ skb_queue_walk_safe(&l->transmq, skb, tmp) { seqno = buf_seqno(skb); next_gap_ack: - if (less_eq(seqno, acked)) { + if (less_eq(seqno, nacked)) { + if (is_uc) + goto release; + /* Skip packets peer has already acked */ + if (!more(seqno, r->acked)) + continue; + /* Get the next of last Gap ACK blocks */ + while (more(seqno, end)) { + if (!last_ga || si >= last_ga->bgack_cnt) + break; + start = end + offset + 1; + end = ntohs(last_ga->gacks[si].ack); + offset = ntohs(last_ga->gacks[si].gap); + si++; + WARN_ONCE(more(start, end) || + (!offset && + si < last_ga->bgack_cnt) || + si > MAX_GAP_ACK_BLKS, + "Corrupted Gap ACK: %d %d %d %d %d\n", + start, end, offset, si, + last_ga->bgack_cnt); + } + /* Check against the last Gap ACK block */ + if (in_range(seqno, start, end)) + continue; + /* Update/release the packet peer is acking */ + bc_has_acked = true; + if (--TIPC_SKB_CB(skb)->ackers) + continue; +release: /* release skb */ __skb_unlink(skb, &l->transmq); kfree_skb(skb); - released++; - } else if (less_eq(seqno, acked + gap)) { - /* First, check if repeated retrans failures occurs? */ - if (!passed && link_retransmit_failure(l, l, &rc)) - return rc; - passed = true; - + } else if (less_eq(seqno, nacked + ngap)) { + /* First gap: check if repeated retrans failures? 
*/ + if (unlikely(seqno == acked + 1 && + link_retransmit_failure(l, r, rc))) { + /* Ignore this bc Gap ACKs if any */ + kfree(this_ga); + this_ga = NULL; + break; + } /* retransmit skb if unrestricted*/ if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) continue; - TIPC_SKB_CB(skb)->nxt_retr = TIPC_UC_RETR_TIME; + TIPC_SKB_CB(skb)->nxt_retr = (is_uc) ? + TIPC_UC_RETR_TIME : TIPC_BC_RETR_LIM; _skb = pskb_copy(skb, GFP_ATOMIC); if (!_skb) continue; @@ -1516,25 +1585,51 @@ next_gap_ack: _skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, _skb); l->stats.retransmitted++; - retransmitted = true; + *retransmitted = true; /* Increase actual retrans counter & mark first time */ if (!TIPC_SKB_CB(skb)->retr_cnt++) TIPC_SKB_CB(skb)->retr_stamp = jiffies; } else { /* retry with Gap ACK blocks if any */ - if (!ga || n >= ga->gack_cnt) + if (n >= gack_cnt) break; - acked = ntohs(ga->gacks[n].ack); - gap = ntohs(ga->gacks[n].gap); + nacked = ntohs(gacks[n].ack); + ngap = ntohs(gacks[n].gap); n++; goto next_gap_ack; } } - if (released || retransmitted) - tipc_link_update_cwin(l, released, retransmitted); - if (released) - tipc_link_advance_backlog(l, xmitq); - return 0; + + /* Renew last Gap ACK blocks for bc if needed */ + if (bc_has_acked) { + if (this_ga) { + kfree(last_ga); + r->last_ga = this_ga; + r->last_gap = gap; + } else if (last_ga) { + if (less(acked, start)) { + si--; + offset = start - acked - 1; + } else if (less(acked, end)) { + acked = end; + } + if (si < last_ga->bgack_cnt) { + last_ga->start_index = si; + r->last_gap = offset; + } else { + kfree(last_ga); + r->last_ga = NULL; + r->last_gap = 0; + } + } else { + r->last_gap = 0; + } + r->acked = acked; + } else { + kfree(this_ga); + } + + return qlen - skb_queue_len(&l->transmq); } /* tipc_link_build_state_msg: prepare link state message for transmission @@ -1651,7 +1746,8 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, kfree_skb(skb); break; } - released += tipc_link_release_pkts(l, msg_ack(hdr)); + released += tipc_link_advance_transmq(l, l, msg_ack(hdr), 0, + NULL, NULL, NULL, NULL); /* Defer delivery if sequence gap */ if (unlikely(seqno != rcv_nxt)) { @@ -1739,7 +1835,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, msg_set_probe(hdr, probe); msg_set_is_keepalive(hdr, probe || probe_reply); if (l->peer_caps & TIPC_GAP_ACK_BLOCK) - glen = tipc_build_gap_ack_blks(l, data, rcvgap); + glen = tipc_build_gap_ack_blks(l, hdr); tipc_mon_prep(l->net, data + glen, &dlen, mstate, l->bearer_id); msg_set_size(hdr, INT_H_SIZE + glen + dlen); skb_trim(skb, INT_H_SIZE + glen + dlen); @@ -2027,20 +2123,19 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, { struct tipc_msg *hdr = buf_msg(skb); struct tipc_gap_ack_blks *ga = NULL; - u16 rcvgap = 0; - u16 ack = msg_ack(hdr); - u16 gap = msg_seq_gap(hdr); + bool reply = msg_probe(hdr), retransmitted = false; + u16 dlen = msg_data_sz(hdr), glen = 0; u16 peers_snd_nxt = msg_next_sent(hdr); u16 peers_tol = msg_link_tolerance(hdr); u16 peers_prio = msg_linkprio(hdr); + u16 gap = msg_seq_gap(hdr); + u16 ack = msg_ack(hdr); u16 rcv_nxt = l->rcv_nxt; - u16 dlen = msg_data_sz(hdr); + u16 rcvgap = 0; int mtyp = msg_type(hdr); - bool reply = msg_probe(hdr); - u16 glen = 0; - void *data; + int rc = 0, released; char *if_name; - int rc = 0; + void *data; trace_tipc_proto_rcv(skb, false, l->name); if (tipc_link_is_blocked(l) || !xmitq) @@ -2137,13 +2232,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, 
} /* Receive Gap ACK blocks from peer if any */ - if (l->peer_caps & TIPC_GAP_ACK_BLOCK) { - ga = (struct tipc_gap_ack_blks *)data; - glen = ntohs(ga->len); - /* sanity check: if failed, ignore Gap ACK blocks */ - if (glen != tipc_gap_ack_blks_sz(ga->gack_cnt)) - ga = NULL; - } + glen = tipc_get_gap_ack_blks(&ga, l, hdr, true); tipc_mon_rcv(l->net, data + glen, dlen - glen, l->addr, &l->mon_state, l->bearer_id); @@ -2158,9 +2247,14 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, tipc_link_build_proto_msg(l, STATE_MSG, 0, reply, rcvgap, 0, 0, xmitq); - rc |= tipc_link_advance_transmq(l, ack, gap, ga, xmitq); + released = tipc_link_advance_transmq(l, l, ack, gap, ga, xmitq, + &retransmitted, &rc); if (gap) l->stats.recv_nacks++; + if (released || retransmitted) + tipc_link_update_cwin(l, released, retransmitted); + if (released) + tipc_link_advance_backlog(l, xmitq); if (unlikely(!skb_queue_empty(&l->wakeupq))) link_prepare_wakeup(l); } @@ -2246,10 +2340,7 @@ void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr) int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, struct sk_buff_head *xmitq) { - struct tipc_link *snd_l = l->bc_sndlink; u16 peers_snd_nxt = msg_bc_snd_nxt(hdr); - u16 from = msg_bcast_ack(hdr) + 1; - u16 to = from + msg_bc_gap(hdr) - 1; int rc = 0; if (!link_is_up(l)) @@ -2271,8 +2362,6 @@ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, if (more(peers_snd_nxt, l->rcv_nxt + l->window)) return rc; - rc = tipc_link_bc_retrans(snd_l, l, from, to, xmitq); - l->snd_nxt = peers_snd_nxt; if (link_bc_rcv_gap(l)) rc |= TIPC_LINK_SND_STATE; @@ -2307,38 +2396,27 @@ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, return 0; } -void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, - struct sk_buff_head *xmitq) +int tipc_link_bc_ack_rcv(struct tipc_link *r, u16 acked, u16 gap, + struct tipc_gap_ack_blks *ga, + struct sk_buff_head *xmitq) { - struct sk_buff *skb, *tmp; - struct tipc_link *snd_l = l->bc_sndlink; + struct tipc_link *l = r->bc_sndlink; + bool unused = false; + int rc = 0; - if (!link_is_up(l) || !l->bc_peer_is_up) - return; + if (!link_is_up(r) || !r->bc_peer_is_up) + return 0; - if (!more(acked, l->acked)) - return; + if (less(acked, r->acked) || (acked == r->acked && !gap && !ga)) + return 0; - trace_tipc_link_bc_ack(l, l->acked, acked, &snd_l->transmq); - /* Skip over packets peer has already acked */ - skb_queue_walk(&snd_l->transmq, skb) { - if (more(buf_seqno(skb), l->acked)) - break; - } + tipc_link_advance_transmq(l, r, acked, gap, ga, xmitq, &unused, &rc); - /* Update/release the packets peer is acking now */ - skb_queue_walk_from_safe(&snd_l->transmq, skb, tmp) { - if (more(buf_seqno(skb), acked)) - break; - if (!--TIPC_SKB_CB(skb)->ackers) { - __skb_unlink(skb, &snd_l->transmq); - kfree_skb(skb); - } - } - l->acked = acked; - tipc_link_advance_backlog(snd_l, xmitq); - if (unlikely(!skb_queue_empty(&snd_l->wakeupq))) - link_prepare_wakeup(snd_l); + tipc_link_advance_backlog(l, xmitq); + if (unlikely(!skb_queue_empty(&l->wakeupq))) + link_prepare_wakeup(l); + + return rc; } /* tipc_link_bc_nack_rcv(): receive broadcast nack message @@ -2366,8 +2444,7 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, return 0; if (dnode == tipc_own_addr(l->net)) { - tipc_link_bc_ack_rcv(l, acked, xmitq); - rc = tipc_link_bc_retrans(l->bc_sndlink, l, from, to, xmitq); + rc = tipc_link_bc_ack_rcv(l, acked, to - acked, NULL, xmitq); l->stats.recv_nacks++; return rc; } 
diff --git a/net/tipc/link.h b/net/tipc/link.h index d3c1c3fc1659..0a0fa7350722 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -143,8 +143,11 @@ int tipc_link_bc_peers(struct tipc_link *l); void tipc_link_set_mtu(struct tipc_link *l, int mtu); int tipc_link_mtu(struct tipc_link *l); int tipc_link_mss(struct tipc_link *l); -void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, - struct sk_buff_head *xmitq); +u16 tipc_get_gap_ack_blks(struct tipc_gap_ack_blks **ga, struct tipc_link *l, + struct tipc_msg *hdr, bool uc); +int tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, u16 gap, + struct tipc_gap_ack_blks *ga, + struct sk_buff_head *xmitq); void tipc_link_build_bc_sync_msg(struct tipc_link *l, struct sk_buff_head *xmitq); void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr); diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 871feadbbc19..ca5f8689a33b 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -160,20 +160,39 @@ struct tipc_gap_ack { /* struct tipc_gap_ack_blks * @len: actual length of the record - * @gack_cnt: number of Gap ACK blocks in the record + * @ugack_cnt: number of Gap ACK blocks for unicast (following the broadcast + * ones) + * @start_index: starting index for "valid" broadcast Gap ACK blocks + * @bgack_cnt: number of Gap ACK blocks for broadcast in the record * @gacks: array of Gap ACK blocks + * + * 31 16 15 0 + * +-------------+-------------+-------------+-------------+ + * | bgack_cnt | ugack_cnt | len | + * +-------------+-------------+-------------+-------------+ - + * | gap | ack | | + * +-------------+-------------+-------------+-------------+ > bc gacks + * : : : | + * +-------------+-------------+-------------+-------------+ - + * | gap | ack | | + * +-------------+-------------+-------------+-------------+ > uc gacks + * : : : | + * +-------------+-------------+-------------+-------------+ - */ struct tipc_gap_ack_blks { __be16 len; - u8 gack_cnt; - u8 reserved; + union { + u8 ugack_cnt; + u8 start_index; + }; + u8 bgack_cnt; struct tipc_gap_ack gacks[]; }; #define tipc_gap_ack_blks_sz(n) (sizeof(struct tipc_gap_ack_blks) + \ sizeof(struct tipc_gap_ack) * (n)) -#define MAX_GAP_ACK_BLKS 32 +#define MAX_GAP_ACK_BLKS 128 #define MAX_GAP_ACK_BLKS_SZ tipc_gap_ack_blks_sz(MAX_GAP_ACK_BLKS) static inline struct tipc_msg *buf_msg(struct sk_buff *skb) diff --git a/net/tipc/node.c b/net/tipc/node.c index 803a3a6d0f50..6a49b3eeaae9 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -2071,10 +2071,16 @@ rcv: le = &n->links[bearer_id]; /* Ensure broadcast reception is in synch with peer's send state */ - if (unlikely(usr == LINK_PROTOCOL)) + if (unlikely(usr == LINK_PROTOCOL)) { + if (unlikely(skb_linearize(skb))) { + tipc_node_put(n); + goto discard; + } + hdr = buf_msg(skb); tipc_node_bc_sync_rcv(n, hdr, bearer_id, &xmitq); - else if (unlikely(tipc_link_acked(n->bc_entry.link) != bc_ack)) + } else if (unlikely(tipc_link_acked(n->bc_entry.link) != bc_ack)) { tipc_bcast_ack_rcv(net, n->bc_entry.link, hdr); + } /* Receive packet directly if conditions permit */ tipc_node_read_lock(n); -- cgit v1.2.3 From c6ed7a5cc2d68c36287c09260dc211173e0447d7 Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Tue, 26 May 2020 16:38:35 +0700 Subject: tipc: add back link trace events In the previous commit ("tipc: add Gap ACK blocks support for broadcast link"), we have removed the following link trace events due to the code changes: - tipc_link_bc_ack - tipc_link_retrans This commit adds them back along with some minor changes to adapt to the new code. 
Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- net/tipc/link.c | 3 +++ net/tipc/trace.h | 13 ++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index d29b9c531171..288c5670cfa5 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1504,6 +1504,8 @@ static int tipc_link_advance_transmq(struct tipc_link *l, struct tipc_link *r, bool is_uc = !link_is_bc_sndlink(l); bool bc_has_acked = false; + trace_tipc_link_retrans(r, acked + 1, acked + gap, &l->transmq); + /* Determine Gap ACK blocks if any for the particular link */ if (ga && is_uc) { /* Get the Gap ACKs, uc part */ @@ -2410,6 +2412,7 @@ int tipc_link_bc_ack_rcv(struct tipc_link *r, u16 acked, u16 gap, if (less(acked, r->acked) || (acked == r->acked && !gap && !ga)) return 0; + trace_tipc_link_bc_ack(r, acked, gap, &l->transmq); tipc_link_advance_transmq(l, r, acked, gap, ga, xmitq, &unused, &rc); tipc_link_advance_backlog(l, xmitq); diff --git a/net/tipc/trace.h b/net/tipc/trace.h index 4d8e00483afc..e7535ab75255 100644 --- a/net/tipc/trace.h +++ b/net/tipc/trace.h @@ -299,8 +299,10 @@ DECLARE_EVENT_CLASS(tipc_link_transmq_class, __entry->from = f; __entry->to = t; __entry->len = skb_queue_len(tq); - __entry->fseqno = msg_seqno(buf_msg(skb_peek(tq))); - __entry->lseqno = msg_seqno(buf_msg(skb_peek_tail(tq))); + __entry->fseqno = __entry->len ? + msg_seqno(buf_msg(skb_peek(tq))) : 0; + __entry->lseqno = __entry->len ? + msg_seqno(buf_msg(skb_peek_tail(tq))) : 0; ), TP_printk("<%s> retrans req: [%u-%u] transmq: %u [%u-%u]\n", @@ -308,15 +310,16 @@ DECLARE_EVENT_CLASS(tipc_link_transmq_class, __entry->len, __entry->fseqno, __entry->lseqno) ); -DEFINE_EVENT(tipc_link_transmq_class, tipc_link_retrans, +DEFINE_EVENT_CONDITION(tipc_link_transmq_class, tipc_link_retrans, TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq), - TP_ARGS(r, f, t, tq) + TP_ARGS(r, f, t, tq), + TP_CONDITION(less_eq(f, t)) ); DEFINE_EVENT_PRINT(tipc_link_transmq_class, tipc_link_bc_ack, TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq), TP_ARGS(r, f, t, tq), - TP_printk("<%s> acked: [%u-%u] transmq: %u [%u-%u]\n", + TP_printk("<%s> acked: %u gap: %u transmq: %u [%u-%u]\n", __entry->name, __entry->from, __entry->to, __entry->len, __entry->fseqno, __entry->lseqno) ); -- cgit v1.2.3 From a91d55d162b86fb983b88f44296149752db7efbd Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Tue, 26 May 2020 16:38:36 +0700 Subject: tipc: enable broadcast retrans via unicast In some environment, broadcast traffic is suppressed at high rate (i.e. a kind of bandwidth limit setting). When it is applied, TIPC broadcast can still run successfully. However, when it comes to a high load, some packets will be dropped first and TIPC tries to retransmit them but the packet retransmission is intentionally broadcast too, so making things worse and not helpful at all. This commit enables the broadcast retransmission via unicast which only retransmits packets to the specific peer that has really reported a gap i.e. not broadcasting to all nodes in the cluster, so will prevent from being suppressed, and also reduce some overheads on the other peers due to duplicates, finally improve the overall TIPC broadcast performance. Note: the functionality can be turned on/off via the sysctl file: echo 1 > /proc/sys/net/tipc/bc_retruni echo 0 > /proc/sys/net/tipc/bc_retruni Default is '0', i.e. the broadcast retransmission still works as usual. 
Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- net/tipc/link.c | 3 +++ net/tipc/trace.h | 13 ++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index d29b9c531171..288c5670cfa5 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1504,6 +1504,8 @@ static int tipc_link_advance_transmq(struct tipc_link *l, struct tipc_link *r, bool is_uc = !link_is_bc_sndlink(l); bool bc_has_acked = false; + trace_tipc_link_retrans(r, acked + 1, acked + gap, &l->transmq); + /* Determine Gap ACK blocks if any for the particular link */ if (ga && is_uc) { /* Get the Gap ACKs, uc part */ @@ -2410,6 +2412,7 @@ int tipc_link_bc_ack_rcv(struct tipc_link *r, u16 acked, u16 gap, if (less(acked, r->acked) || (acked == r->acked && !gap && !ga)) return 0; + trace_tipc_link_bc_ack(r, acked, gap, &l->transmq); tipc_link_advance_transmq(l, r, acked, gap, ga, xmitq, &unused, &rc); tipc_link_advance_backlog(l, xmitq); diff --git a/net/tipc/trace.h b/net/tipc/trace.h index 4d8e00483afc..e7535ab75255 100644 --- a/net/tipc/trace.h +++ b/net/tipc/trace.h @@ -299,8 +299,10 @@ DECLARE_EVENT_CLASS(tipc_link_transmq_class, __entry->from = f; __entry->to = t; __entry->len = skb_queue_len(tq); - __entry->fseqno = msg_seqno(buf_msg(skb_peek(tq))); - __entry->lseqno = msg_seqno(buf_msg(skb_peek_tail(tq))); + __entry->fseqno = __entry->len ? + msg_seqno(buf_msg(skb_peek(tq))) : 0; + __entry->lseqno = __entry->len ? + msg_seqno(buf_msg(skb_peek_tail(tq))) : 0; ), TP_printk("<%s> retrans req: [%u-%u] transmq: %u [%u-%u]\n", @@ -308,15 +310,16 @@ DECLARE_EVENT_CLASS(tipc_link_transmq_class, __entry->len, __entry->fseqno, __entry->lseqno) ); -DEFINE_EVENT(tipc_link_transmq_class, tipc_link_retrans, +DEFINE_EVENT_CONDITION(tipc_link_transmq_class, tipc_link_retrans, TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq), - TP_ARGS(r, f, t, tq) + TP_ARGS(r, f, t, tq), + TP_CONDITION(less_eq(f, t)) ); DEFINE_EVENT_PRINT(tipc_link_transmq_class, tipc_link_bc_ack, TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq), TP_ARGS(r, f, t, tq), - TP_printk("<%s> acked: [%u-%u] transmq: %u [%u-%u]\n", + TP_printk("<%s> acked: %u gap: %u transmq: %u [%u-%u]\n", __entry->name, __entry->from, __entry->to, __entry->len, __entry->fseqno, __entry->lseqno) ); -- cgit v1.2.3 From a91d55d162b86fb983b88f44296149752db7efbd Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Tue, 26 May 2020 16:38:36 +0700 Subject: tipc: enable broadcast retrans via unicast In some environments, broadcast traffic is suppressed at high rates (i.e. a kind of bandwidth limit setting). When this is applied, TIPC broadcast can still run successfully. However, when it comes to a high load, some packets will be dropped first and TIPC tries to retransmit them, but the retransmission is intentionally broadcast too, making things worse and not helpful at all. This commit enables broadcast retransmission via unicast, which only retransmits packets to the specific peer that has actually reported a gap, i.e. without broadcasting to all nodes in the cluster. This prevents the retransmissions from being suppressed, reduces overhead on the other peers due to duplicates, and ultimately improves the overall TIPC broadcast performance. Note: the functionality can be turned on/off via the sysctl file: echo 1 > /proc/sys/net/tipc/bc_retruni echo 0 > /proc/sys/net/tipc/bc_retruni Default is '0', i.e. the broadcast retransmission still works as usual.
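Stripped of the TIPC plumbing, the knob boils down to one queue-selection decision; a sketch under simplified names (in the patch the chosen queue is passed down as the retrq argument):

/* "struct queue" is a placeholder type for this sketch. */
struct queue;

static struct queue *pick_retr_queue(unsigned long bc_retruni,
                                     struct queue *bcast_xmitq,
                                     struct queue *peer_retrq)
{
        if (!bc_retruni)
                return bcast_xmitq; /* legacy: retransmit via broadcast */

        return peer_retrq; /* retransmit only to the gap reporter */
}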
tipc_link_bc_ack_rcv(struct tipc_link *r, u16 acked, u16 gap, return 0; trace_tipc_link_bc_ack(r, acked, gap, &l->transmq); - tipc_link_advance_transmq(l, r, acked, gap, ga, xmitq, &unused, &rc); + tipc_link_advance_transmq(l, r, acked, gap, ga, retrq, &unused, &rc); tipc_link_advance_backlog(l, xmitq); if (unlikely(!skb_queue_empty(&l->wakeupq))) @@ -2447,7 +2448,8 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, return 0; if (dnode == tipc_own_addr(l->net)) { - rc = tipc_link_bc_ack_rcv(l, acked, to - acked, NULL, xmitq); + rc = tipc_link_bc_ack_rcv(l, acked, to - acked, NULL, xmitq, + xmitq); l->stats.recv_nacks++; return rc; } diff --git a/net/tipc/link.h b/net/tipc/link.h index 0a0fa7350722..4d0768cf91d5 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -147,7 +147,8 @@ u16 tipc_get_gap_ack_blks(struct tipc_gap_ack_blks **ga, struct tipc_link *l, struct tipc_msg *hdr, bool uc); int tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, u16 gap, struct tipc_gap_ack_blks *ga, - struct sk_buff_head *xmitq); + struct sk_buff_head *xmitq, + struct sk_buff_head *retrq); void tipc_link_build_bc_sync_msg(struct tipc_link *l, struct sk_buff_head *xmitq); void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr); diff --git a/net/tipc/node.c b/net/tipc/node.c index 6a49b3eeaae9..548207fdec15 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1772,7 +1772,7 @@ static void tipc_node_bc_sync_rcv(struct tipc_node *n, struct tipc_msg *hdr, struct tipc_link *ucl; int rc; - rc = tipc_bcast_sync_rcv(n->net, n->bc_entry.link, hdr); + rc = tipc_bcast_sync_rcv(n->net, n->bc_entry.link, hdr, xmitq); if (rc & TIPC_LINK_DOWN_EVT) { tipc_node_reset_links(n); diff --git a/net/tipc/sysctl.c b/net/tipc/sysctl.c index 58ab3d6dcdce..97a6264a2993 100644 --- a/net/tipc/sysctl.c +++ b/net/tipc/sysctl.c @@ -36,7 +36,7 @@ #include "core.h" #include "trace.h" #include "crypto.h" - +#include "bcast.h" #include <linux/module.h> static struct ctl_table_header *tipc_ctl_hdr; @@ -75,6 +75,13 @@ static struct ctl_table tipc_table[] = { .extra1 = SYSCTL_ONE, }, #endif + { + .procname = "bc_retruni", + .data = &sysctl_tipc_bc_retruni, + .maxlen = sizeof(sysctl_tipc_bc_retruni), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, {} }; -- cgit v1.2.3 From 03b6fefd9bb4844c75faeb10df8496794e2fd5da Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Tue, 26 May 2020 16:38:37 +0700 Subject: tipc: add support for broadcast rcv stats dumping This commit enables dumping the statistics of a broadcast-receiver link like the traditional 'broadcast-link' one (which is for the broadcast-sender). The link dumping can be triggered via netlink (e.g. the iproute2/tipc tool) by the link flag 'TIPC_NLA_LINK_BROADCAST' as the indicator. The name of a broadcast-receiver link of a specific peer will be in the format: 'broadcast-link:<peer-id>'. For example:
Link Window:50 packets
RX packets:7841 fragments:2408/440 bundles:0/0
TX packets:0 fragments:0/0 bundles:0/0
RX naks:0 defs:124 dups:0
TX naks:21 acks:0 retrans:0
Congestion link:0 Send queue max:0 avg:0
In addition, the broadcast-receiver link statistics can be reset in the usual way via netlink by specifying that link name in the command. Note: 'tipc_link_name_ext()' is removed because the link name can now be retrieved simply via 'l->name'. Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller
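The per-peer naming scheme introduced by this patch is plain string assembly, as the link.c hunk below shows: the receiver link gets 'broadcast-link:' plus the stringified peer id (with a hex-address fallback when the id string is longer than 16 characters). A stand-alone sketch of just the assembly step, assuming the peer id has already been rendered to a string; the buffer size and the example peer value are illustrative only:

#include <stdio.h>

#define TIPC_MAX_LINK_NAME 68	/* illustrative buffer size */

/* Models the naming scheme from tipc_link_bc_create(): a
 * broadcast-receiver link is named "broadcast-link:<peer>", while
 * the broadcast send link keeps the plain "broadcast-link" name.
 */
static void bc_link_name(char *buf, size_t len, const char *peer_str)
{
	if (peer_str)
		snprintf(buf, len, "%s:%s", "broadcast-link", peer_str);
	else
		snprintf(buf, len, "%s", "broadcast-link");
}

int main(void)
{
	char name[TIPC_MAX_LINK_NAME];

	bc_link_name(name, sizeof(name), "1001002");	/* made-up peer id */
	printf("receiver link: %s\n", name);	/* broadcast-link:1001002 */
	bc_link_name(name, sizeof(name), NULL);
	printf("send link:     %s\n", name);	/* broadcast-link */
	return 0;
}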
--- net/tipc/bcast.c | 6 ++--- net/tipc/bcast.h | 5 +++-- net/tipc/link.c | 65 +++++++++++++++++++++++++++--------------------------- net/tipc/link.h | 3 +-- net/tipc/msg.c | 9 ++++---- net/tipc/msg.h | 2 +- net/tipc/netlink.c | 2 +- net/tipc/node.c | 61 +++++++++++++++++++++++++++++++++++++++++++------- net/tipc/trace.h | 4 ++-- 9 files changed, 101 insertions(+), 56 deletions(-) (limited to 'net') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 50a16f8bebd9..383f87bc1061 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -563,10 +563,8 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_l) tipc_sk_rcv(net, inputq); } -int tipc_bclink_reset_stats(struct net *net) +int tipc_bclink_reset_stats(struct net *net, struct tipc_link *l) { - struct tipc_link *l = tipc_bc_sndlink(net); - if (!l) return -ENOPROTOOPT; @@ -694,7 +692,7 @@ int tipc_bcast_init(struct net *net) tn->bcbase = bb; spin_lock_init(&tipc_net(net)->bclock); - if (!tipc_link_bc_create(net, 0, 0, + if (!tipc_link_bc_create(net, 0, 0, NULL, FB_MTU, BCLINK_WIN_DEFAULT, BCLINK_WIN_DEFAULT, diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index 97d3cf9d3e4d..4240c95188b1 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -96,9 +96,10 @@ void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, int tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l, struct tipc_msg *hdr, struct sk_buff_head *retrq); -int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg); +int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg, + struct tipc_link *bcl); int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]); -int tipc_bclink_reset_stats(struct net *net); +int tipc_bclink_reset_stats(struct net *net, struct tipc_link *l); u32 tipc_bcast_get_broadcast_mode(struct net *net); u32 tipc_bcast_get_broadcast_ratio(struct net *net); diff --git a/net/tipc/link.c b/net/tipc/link.c index af352391e2ab..ee3b8d0576b8 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -539,7 +539,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, * * Returns true if link was created, otherwise false */ -bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, +bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, u8 *peer_id, int mtu, u32 min_win, u32 max_win, u16 peer_caps, struct sk_buff_head *inputq, struct sk_buff_head *namedq, @@ -554,7 +554,18 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, return false; l = *link; - strcpy(l->name, tipc_bclink_name); + if (peer_id) { + char peer_str[NODE_ID_STR_LEN] = {0,}; + + tipc_nodeid2string(peer_str, peer_id); + if (strlen(peer_str) > 16) + sprintf(peer_str, "%x", peer); + /* Broadcast receiver link name: "broadcast-link:<peer-id>" */ + snprintf(l->name, sizeof(l->name), "%s:%s", tipc_bclink_name, + peer_str); + } else { + strcpy(l->name, tipc_bclink_name); + } trace_tipc_link_reset(l, TIPC_DUMP_ALL, "bclink created!"); tipc_link_reset(l); l->state = LINK_RESET; @@ -1412,11 +1423,8 @@ static u8 __tipc_build_gap_ack_blks(struct tipc_gap_ack_blks *ga, gacks[n].ack = htons(expect - 1); gacks[n].gap = htons(seqno - expect); if (++n >= MAX_GAP_ACK_BLKS / 2) { - char buf[TIPC_MAX_LINK_NAME]; - pr_info_ratelimited("Gacks on %s: %d, ql: %d!\n", - tipc_link_name_ext(l, buf), - n, + l->name, n, skb_queue_len(&l->deferdq)); return n; } @@ -1587,6 +1595,8 @@ release: _skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, _skb); l->stats.retransmitted++; + if (!is_uc) + r->stats.retransmitted++; *retransmitted
= true; /* Increase actual retrans counter & mark first time */ if (!TIPC_SKB_CB(skb)->retr_cnt++) @@ -1753,7 +1763,8 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, /* Defer delivery if sequence gap */ if (unlikely(seqno != rcv_nxt)) { - __tipc_skb_queue_sorted(defq, seqno, skb); + if (!__tipc_skb_queue_sorted(defq, seqno, skb)) + l->stats.duplicates++; rc |= tipc_link_build_nack_msg(l, xmitq); break; } @@ -1787,15 +1798,15 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, int tolerance, int priority, struct sk_buff_head *xmitq) { + struct tipc_mon_state *mstate = &l->mon_state; + struct sk_buff_head *dfq = &l->deferdq; struct tipc_link *bcl = l->bc_rcvlink; - struct sk_buff *skb; struct tipc_msg *hdr; - struct sk_buff_head *dfq = &l->deferdq; + struct sk_buff *skb; bool node_up = link_is_up(bcl); - struct tipc_mon_state *mstate = &l->mon_state; + u16 glen = 0, bc_rcvgap = 0; int dlen = 0; void *data; - u16 glen = 0; /* Don't send protocol message during reset or link failover */ if (tipc_link_is_blocked(l)) @@ -1833,7 +1844,8 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, if (l->peer_caps & TIPC_LINK_PROTO_SEQNO) msg_set_seqno(hdr, l->snd_nxt_state++); msg_set_seq_gap(hdr, rcvgap); - msg_set_bc_gap(hdr, link_bc_rcv_gap(bcl)); + bc_rcvgap = link_bc_rcv_gap(bcl); + msg_set_bc_gap(hdr, bc_rcvgap); msg_set_probe(hdr, probe); msg_set_is_keepalive(hdr, probe || probe_reply); if (l->peer_caps & TIPC_GAP_ACK_BLOCK) @@ -1858,6 +1870,8 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, l->stats.sent_probes++; if (rcvgap) l->stats.sent_nacks++; + if (bc_rcvgap) + bcl->stats.sent_nacks++; skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, skb); trace_tipc_proto_build(skb, false, l->name); @@ -2358,8 +2372,6 @@ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, if (!l->bc_peer_is_up) return rc; - l->stats.recv_nacks++; - /* Ignore if peers_snd_nxt goes beyond receive window */ if (more(peers_snd_nxt, l->rcv_nxt + l->window)) return rc; @@ -2410,6 +2422,11 @@ int tipc_link_bc_ack_rcv(struct tipc_link *r, u16 acked, u16 gap, if (!link_is_up(r) || !r->bc_peer_is_up) return 0; + if (gap) { + l->stats.recv_nacks++; + r->stats.recv_nacks++; + } + if (less(acked, r->acked) || (acked == r->acked && !gap && !ga)) return 0; @@ -2721,16 +2738,15 @@ msg_full: return -EMSGSIZE; } -int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg) +int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg, + struct tipc_link *bcl) { int err; void *hdr; struct nlattr *attrs; struct nlattr *prop; - struct tipc_net *tn = net_generic(net, tipc_net_id); u32 bc_mode = tipc_bcast_get_broadcast_mode(net); u32 bc_ratio = tipc_bcast_get_broadcast_ratio(net); - struct tipc_link *bcl = tn->bcl; if (!bcl) return 0; @@ -2817,21 +2833,6 @@ void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit) l->abort_limit = limit; } -char *tipc_link_name_ext(struct tipc_link *l, char *buf) -{ - if (!l) - scnprintf(buf, TIPC_MAX_LINK_NAME, "null"); - else if (link_is_bc_sndlink(l)) - scnprintf(buf, TIPC_MAX_LINK_NAME, "broadcast-sender"); - else if (link_is_bc_rcvlink(l)) - scnprintf(buf, TIPC_MAX_LINK_NAME, - "broadcast-receiver, peer %x", l->addr); - else - memcpy(buf, l->name, TIPC_MAX_LINK_NAME); - - return buf; -} - /** * tipc_link_dump - dump TIPC link data * @l: tipc link to be dumped diff --git a/net/tipc/link.h b/net/tipc/link.h index 4d0768cf91d5..fc07232c9a12 100644 --- 
a/net/tipc/link.h +++ b/net/tipc/link.h @@ -80,7 +80,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, struct sk_buff_head *inputq, struct sk_buff_head *namedq, struct tipc_link **link); -bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, +bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, u8 *peer_id, int mtu, u32 min_win, u32 max_win, u16 peer_caps, struct sk_buff_head *inputq, struct sk_buff_head *namedq, @@ -111,7 +111,6 @@ u16 tipc_link_rcv_nxt(struct tipc_link *l); u16 tipc_link_acked(struct tipc_link *l); u32 tipc_link_id(struct tipc_link *l); char *tipc_link_name(struct tipc_link *l); -char *tipc_link_name_ext(struct tipc_link *l, char *buf); u32 tipc_link_state(struct tipc_link *l); char tipc_link_plane(struct tipc_link *l); int tipc_link_prio(struct tipc_link *l); diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 4d0e0bdd997b..c69fb99163fc 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -825,19 +825,19 @@ bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg, * @seqno: sequence number of buffer to add * @skb: buffer to add */ -void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, +bool __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, struct sk_buff *skb) { struct sk_buff *_skb, *tmp; if (skb_queue_empty(list) || less(seqno, buf_seqno(skb_peek(list)))) { __skb_queue_head(list, skb); - return; + return true; } if (more(seqno, buf_seqno(skb_peek_tail(list)))) { __skb_queue_tail(list, skb); - return; + return true; } skb_queue_walk_safe(list, _skb, tmp) { @@ -846,9 +846,10 @@ void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, if (seqno == buf_seqno(_skb)) break; __skb_queue_before(list, _skb, skb); - return; + return true; } kfree_skb(skb); + return false; } void tipc_skb_reject(struct net *net, int err, struct sk_buff *skb, diff --git a/net/tipc/msg.h b/net/tipc/msg.h index ca5f8689a33b..cd4281779468 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -1145,7 +1145,7 @@ bool tipc_msg_assemble(struct sk_buff_head *list); bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq); bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg, struct sk_buff_head *cpy); -void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, +bool __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, struct sk_buff *skb); bool tipc_msg_skb_clone(struct sk_buff_head *msg, struct sk_buff_head *cpy); diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index bb9862410e68..c4aee6247d55 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -188,7 +188,7 @@ static const struct genl_ops tipc_genl_v2_ops[] = { }, { .cmd = TIPC_NL_LINK_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT, .doit = tipc_nl_node_get_link, .dumpit = tipc_nl_node_dump_link, }, diff --git a/net/tipc/node.c b/net/tipc/node.c index 548207fdec15..0312fb181d94 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1138,7 +1138,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, if (unlikely(!n->bc_entry.link)) { snd_l = tipc_bc_sndlink(net); if (!tipc_link_bc_create(net, tipc_own_addr(net), - addr, U16_MAX, + addr, peer_id, U16_MAX, tipc_link_min_win(snd_l), tipc_link_max_win(snd_l), n->capabilities, @@ -2435,7 +2435,7 @@ int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; if (strcmp(name, tipc_bclink_name) == 0) { - err = tipc_nl_add_bc_link(net, &msg); + err = tipc_nl_add_bc_link(net, &msg, 
tipc_net(net)->bcl); if (err) goto err_free; } else { @@ -2479,6 +2479,7 @@ int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info) struct tipc_node *node; struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; struct net *net = sock_net(skb->sk); + struct tipc_net *tn = tipc_net(net); struct tipc_link_entry *le; if (!info->attrs[TIPC_NLA_LINK]) @@ -2495,11 +2496,26 @@ int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info) link_name = nla_data(attrs[TIPC_NLA_LINK_NAME]); - if (strcmp(link_name, tipc_bclink_name) == 0) { - err = tipc_bclink_reset_stats(net); + err = -EINVAL; + if (!strcmp(link_name, tipc_bclink_name)) { + err = tipc_bclink_reset_stats(net, tipc_bc_sndlink(net)); if (err) return err; return 0; + } else if (strstr(link_name, tipc_bclink_name)) { + rcu_read_lock(); + list_for_each_entry_rcu(node, &tn->node_list, list) { + tipc_node_read_lock(node); + link = node->bc_entry.link; + if (link && !strcmp(link_name, tipc_link_name(link))) { + err = tipc_bclink_reset_stats(net, link); + tipc_node_read_unlock(node); + break; + } + tipc_node_read_unlock(node); + } + rcu_read_unlock(); + return err; } node = tipc_node_find_by_name(net, link_name, &bearer_id); @@ -2523,7 +2539,8 @@ int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info) /* Caller should hold node lock */ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg, - struct tipc_node *node, u32 *prev_link) + struct tipc_node *node, u32 *prev_link, + bool bc_link) { u32 i; int err; @@ -2539,6 +2556,14 @@ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg, if (err) return err; } + + if (bc_link) { + *prev_link = i; + err = tipc_nl_add_bc_link(net, msg, node->bc_entry.link); + if (err) + return err; + } + *prev_link = 0; return 0; @@ -2547,17 +2572,36 @@ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg, int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); + struct nlattr **attrs = genl_dumpit_info(cb)->attrs; + struct nlattr *link[TIPC_NLA_LINK_MAX + 1]; struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *node; struct tipc_nl_msg msg; u32 prev_node = cb->args[0]; u32 prev_link = cb->args[1]; int done = cb->args[2]; + bool bc_link = cb->args[3]; int err; if (done) return 0; + if (!prev_node) { + /* Check if broadcast-receiver links dumping is needed */ + if (attrs && attrs[TIPC_NLA_LINK]) { + err = nla_parse_nested_deprecated(link, + TIPC_NLA_LINK_MAX, + attrs[TIPC_NLA_LINK], + tipc_nl_link_policy, + NULL); + if (unlikely(err)) + return err; + if (unlikely(!link[TIPC_NLA_LINK_BROADCAST])) + return -EINVAL; + bc_link = true; + } + } + msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; @@ -2581,7 +2625,7 @@ int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb) list) { tipc_node_read_lock(node); err = __tipc_nl_add_node_links(net, &msg, node, - &prev_link); + &prev_link, bc_link); tipc_node_read_unlock(node); if (err) goto out; @@ -2589,14 +2633,14 @@ int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb) prev_node = node->addr; } } else { - err = tipc_nl_add_bc_link(net, &msg); + err = tipc_nl_add_bc_link(net, &msg, tn->bcl); if (err) goto out; list_for_each_entry_rcu(node, &tn->node_list, list) { tipc_node_read_lock(node); err = __tipc_nl_add_node_links(net, &msg, node, - &prev_link); + &prev_link, bc_link); tipc_node_read_unlock(node); if 
(err) goto out; @@ -2611,6 +2655,7 @@ out: cb->args[0] = prev_node; cb->args[1] = prev_link; cb->args[2] = done; + cb->args[3] = bc_link; return skb->len; } diff --git a/net/tipc/trace.h b/net/tipc/trace.h index e7535ab75255..04af83f0500c 100644 --- a/net/tipc/trace.h +++ b/net/tipc/trace.h @@ -255,7 +255,7 @@ DECLARE_EVENT_CLASS(tipc_link_class, TP_fast_assign( __assign_str(header, header); - tipc_link_name_ext(l, __entry->name); + memcpy(__entry->name, tipc_link_name(l), TIPC_MAX_LINK_NAME); tipc_link_dump(l, dqueues, __get_str(buf)); ), @@ -295,7 +295,7 @@ DECLARE_EVENT_CLASS(tipc_link_transmq_class, ), TP_fast_assign( - tipc_link_name_ext(r, __entry->name); + memcpy(__entry->name, tipc_link_name(r), TIPC_MAX_LINK_NAME); __entry->from = f; __entry->to = t; __entry->len = skb_queue_len(tq); -- cgit v1.2.3 From 0a3e060f340dbe232ffa290c40f879b7f7db595b Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Tue, 26 May 2020 16:38:38 +0700 Subject: tipc: add test for Nagle algorithm effectiveness When streaming in Nagle mode, we try to bundle as many small user messages as possible if there is an outstanding buffer, i.e. one not yet ACK-ed by the receiving side, which helps boost the overall throughput. So, the algorithm's effectiveness really depends on when the Nagle ACK comes and on the specific network latency (RTT), compared to the user's message sending rate. In a bad case, where the user's sending rate is low or the network latency is small, there will not be many bundles, so generating a Nagle ACK or waiting for it is not meaningful. For example: if a user sends its messages every 100ms and the RTT is 50ms, then for each message we require one Nagle ACK, yet only one user message is sent without any bundling. In a better case, even if we have a few bundles (e.g. with an RTT of 300ms), if the user sends medium-sized messages there will not be any difference at all: that is, 3 x 1000-byte data messages, if bundled, will still result in 3 bundles with MTU = 1500. When Nagle is ineffective, the delay added to user message sending is clearly wasted compared to sending directly. Besides, adding Nagle ACKs consumes some processor load on both the sending and receiving sides. This commit adds a test of the Nagle algorithm's effectiveness for an individual connection on the network where it actually runs. Particularly, upon receipt of a Nagle ACK we will compare the number of bundles in the backlog queue to the number of user messages which would have been sent directly without Nagle. If the ratio is good (e.g. >= 2), Nagle mode will be kept for further message sending. Otherwise, we will leave Nagle and put a 'penalty' on the connection, so it will have to spend more 'one-way' messages before being able to re-enter Nagle. In addition, the 'ack-required' bit is only set when really needed, so that the number of Nagle ACKs is reduced during Nagle mode. Testing with a benchmark showed that, with the patch, there was not much difference in throughput for small messages, since the tool continuously sends messages without a break, so Nagle would still take effect. Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller
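The heart of the test is a small ratio check performed on each Nagle ACK, visible in the socket.c hunk below: msg_acc counts the user messages accepted under Nagle and pkt_cnt counts the packets actually sent for them, so msg_acc / pkt_cnt < 2 means bundling failed to at least halve the packet count. A stand-alone sketch of that bookkeeping, with field and constant names modeled on (but not identical to) the patch:

#include <stdbool.h>
#include <stdio.h>

#define NAGLE_START_INIT 4
#define NAGLE_START_MAX  1024

struct conn {
	unsigned int msg_acc;	/* user messages accumulated under Nagle */
	unsigned int pkt_cnt;	/* packets actually sent for them */
	unsigned int nagle_start;	/* one-way msgs needed to re-enter Nagle */
	bool nagle_on;
};

/* Models the check done on Nagle ACK receipt: keep Nagle only if
 * bundling at least halved the packet count (>= 2 msgs per packet);
 * otherwise leave Nagle and double the re-entry "penalty".
 */
static void nagle_ack_rcv(struct conn *c)
{
	if (!c->pkt_cnt || c->msg_acc / c->pkt_cnt < 2) {
		c->nagle_on = false;
		if (c->nagle_start < NAGLE_START_MAX)
			c->nagle_start *= 2;
	} else {
		c->nagle_start = NAGLE_START_INIT;
	}
	c->msg_acc = 0;
	c->pkt_cnt = 0;
}

int main(void)
{
	struct conn c = { .nagle_start = NAGLE_START_INIT, .nagle_on = true };

	c.msg_acc = 10; c.pkt_cnt = 3;	/* ~3.3 msgs/packet: effective */
	nagle_ack_rcv(&c);
	printf("effective:   nagle_on=%d start=%u\n", c.nagle_on, c.nagle_start);

	c.nagle_on = true;
	c.msg_acc = 4; c.pkt_cnt = 3;	/* ~1.3 msgs/packet: ineffective */
	nagle_ack_rcv(&c);
	printf("ineffective: nagle_on=%d start=%u\n", c.nagle_on, c.nagle_start);
	return 0;
}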
--- net/tipc/msg.c | 3 --- net/tipc/msg.h | 14 ++++++++++-- net/tipc/socket.c | 64 ++++++++++++++++++++++++++++++++++++++++++----------- 3 files changed, 64 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/tipc/msg.c b/net/tipc/msg.c index c69fb99163fc..23809039dda1 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -235,9 +235,6 @@ int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen, msg_set_size(hdr, MIN_H_SIZE); __skb_queue_tail(txq, skb); total += 1; - if (prev) - msg_set_ack_required(buf_msg(prev), 0); - msg_set_ack_required(hdr, 1); } hdr = buf_msg(skb); curr = msg_blocks(hdr); diff --git a/net/tipc/msg.h b/net/tipc/msg.h index cd4281779468..58660d56bc83 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -340,9 +340,19 @@ static inline int msg_ack_required(struct tipc_msg *m) return msg_bits(m, 0, 18, 1); } -static inline void msg_set_ack_required(struct tipc_msg *m, u32 d) +static inline void msg_set_ack_required(struct tipc_msg *m) { - msg_set_bits(m, 0, 18, 1, d); + msg_set_bits(m, 0, 18, 1, 1); +} + +static inline int msg_nagle_ack(struct tipc_msg *m) +{ + return msg_bits(m, 0, 18, 1); +} + +static inline void msg_set_nagle_ack(struct tipc_msg *m) +{ + msg_set_bits(m, 0, 18, 1, 1); } static inline bool msg_is_rcast(struct tipc_msg *m) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index e370ad0edd76..d6b67d07d22e 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -48,6 +48,8 @@ #include "group.h" #include "trace.h" +#define NAGLE_START_INIT 4 +#define NAGLE_START_MAX 1024 #define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ #define CONN_PROBING_INTV msecs_to_jiffies(3600000) /* [ms] => 1 h */ #define TIPC_FWD_MSG 1 @@ -119,7 +121,10 @@ struct tipc_sock { struct rcu_head rcu; struct tipc_group *group; u32 oneway; + u32 nagle_start; u16 snd_backlog; + u16 msg_acc; + u16 pkt_cnt; bool expect_ack; bool nodelay; bool group_is_open; @@ -143,7 +148,7 @@ static int tipc_sk_insert(struct tipc_sock *tsk); static void tipc_sk_remove(struct tipc_sock *tsk); static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz); static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz); -static void tipc_sk_push_backlog(struct tipc_sock *tsk); +static void tipc_sk_push_backlog(struct tipc_sock *tsk, bool nagle_ack); static const struct proto_ops packet_ops; static const struct proto_ops stream_ops; @@ -474,6 +479,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, tsk = tipc_sk(sk); tsk->max_pkt = MAX_PKT_DEFAULT; tsk->maxnagle = 0; + tsk->nagle_start = NAGLE_START_INIT; INIT_LIST_HEAD(&tsk->publications); INIT_LIST_HEAD(&tsk->cong_links); msg = &tsk->phdr; @@ -541,7 +547,7 @@ static void __tipc_shutdown(struct socket *sock, int error) !tsk_conn_cong(tsk))); /* Push out delayed messages if in Nagle mode */ - tipc_sk_push_backlog(tsk); + tipc_sk_push_backlog(tsk, false); /* Remove pending SYN */ __skb_queue_purge(&sk->sk_write_queue); @@ -1252,14 +1258,37 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, /* tipc_sk_push_backlog(): send accumulated buffers in socket write queue * when socket is in Nagle mode */ -static void tipc_sk_push_backlog(struct tipc_sock *tsk) +static void tipc_sk_push_backlog(struct tipc_sock *tsk, bool nagle_ack) { struct sk_buff_head *txq = &tsk->sk.sk_write_queue; + struct sk_buff *skb = skb_peek_tail(txq); struct net *net = sock_net(&tsk->sk); u32 dnode = tsk_peer_node(tsk); - struct sk_buff *skb = skb_peek(txq); int rc; + if
(nagle_ack) { + tsk->pkt_cnt += skb_queue_len(txq); + if (!tsk->pkt_cnt || tsk->msg_acc / tsk->pkt_cnt < 2) { + tsk->oneway = 0; + if (tsk->nagle_start < NAGLE_START_MAX) + tsk->nagle_start *= 2; + tsk->expect_ack = false; + pr_debug("tsk %10u: bad nagle %u -> %u, next start %u!\n", + tsk->portid, tsk->msg_acc, tsk->pkt_cnt, + tsk->nagle_start); + } else { + tsk->nagle_start = NAGLE_START_INIT; + if (skb) { + msg_set_ack_required(buf_msg(skb)); + tsk->expect_ack = true; + } else { + tsk->expect_ack = false; + } + } + tsk->msg_acc = 0; + tsk->pkt_cnt = 0; + } + if (!skb || tsk->cong_link_cnt) return; @@ -1267,9 +1296,10 @@ static void tipc_sk_push_backlog(struct tipc_sock *tsk) if (msg_is_syn(buf_msg(skb))) return; + if (tsk->msg_acc) + tsk->pkt_cnt += skb_queue_len(txq); tsk->snt_unacked += tsk->snd_backlog; tsk->snd_backlog = 0; - tsk->expect_ack = true; rc = tipc_node_xmit(net, txq, dnode, tsk->portid); if (rc == -ELINKCONG) tsk->cong_link_cnt = 1; @@ -1322,8 +1352,7 @@ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, return; } else if (mtyp == CONN_ACK) { was_cong = tsk_conn_cong(tsk); - tsk->expect_ack = false; - tipc_sk_push_backlog(tsk); + tipc_sk_push_backlog(tsk, msg_nagle_ack(hdr)); tsk->snt_unacked -= msg_conn_ack(hdr); if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) tsk->snd_win = msg_adv_win(hdr); @@ -1516,6 +1545,7 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *hdr = &tsk->phdr; struct net *net = sock_net(sk); + struct sk_buff *skb; u32 dnode = tsk_peer_node(tsk); int maxnagle = tsk->maxnagle; int maxpkt = tsk->max_pkt; @@ -1544,17 +1574,25 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) break; send = min_t(size_t, dlen - sent, TIPC_MAX_USER_MSG_SIZE); blocks = tsk->snd_backlog; - if (tsk->oneway++ >= 4 && send <= maxnagle) { + if (tsk->oneway++ >= tsk->nagle_start && send <= maxnagle) { rc = tipc_msg_append(hdr, m, send, maxnagle, txq); if (unlikely(rc < 0)) break; blocks += rc; + tsk->msg_acc++; if (blocks <= 64 && tsk->expect_ack) { tsk->snd_backlog = blocks; sent += send; break; + } else if (blocks > 64) { + tsk->pkt_cnt += skb_queue_len(txq); + } else { + skb = skb_peek_tail(txq); + msg_set_ack_required(buf_msg(skb)); + tsk->expect_ack = true; + tsk->msg_acc = 0; + tsk->pkt_cnt = 0; } - tsk->expect_ack = true; } else { rc = tipc_msg_build(hdr, m, sent, send, maxpkt, txq); if (unlikely(rc != send)) @@ -2091,7 +2129,7 @@ static void tipc_sk_proto_rcv(struct sock *sk, smp_wmb(); tsk->cong_link_cnt--; wakeup = true; - tipc_sk_push_backlog(tsk); + tipc_sk_push_backlog(tsk, false); break; case GROUP_PROTOCOL: tipc_group_proto_rcv(grp, &wakeup, hdr, inputq, xmitq); @@ -2180,7 +2218,7 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb, return false; case TIPC_ESTABLISHED: if (!skb_queue_empty(&sk->sk_write_queue)) - tipc_sk_push_backlog(tsk); + tipc_sk_push_backlog(tsk, false); /* Accept only connection-based messages sent by peer */ if (likely(con_msg && !err && pport == oport && pnode == onode)) { @@ -2188,8 +2226,10 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb, struct sk_buff *skb; skb = tipc_sk_build_ack(tsk); - if (skb) + if (skb) { + msg_set_nagle_ack(buf_msg(skb)); __skb_queue_tail(xmitq, skb); + } } return true; } -- cgit v1.2.3 From 58cff782cc55eb755826c649976aea9f5f8b3086 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 26 May 2020 14:29:00 +0200 
Subject: flow_dissector: Parse multiple MPLS Label Stack Entries The current MPLS dissector only parses the first MPLS Label Stack Entry (the second LSE can be parsed too, but only to set a key_id). This patch adds the possibility to parse several LSEs by making __skb_flow_dissect_mpls() return FLOW_DISSECT_RET_PROTO_AGAIN as long as the Bottom Of Stack bit hasn't been seen, up to a maximum of FLOW_DIS_MPLS_MAX entries. FLOW_DIS_MPLS_MAX is arbitrarily set to 7. This should be enough for many practical purposes, without wasting too much space. To record the parsed values, flow_dissector_key_mpls is modified to store an array of stack entries, instead of just the values of the first one. A bit field, "used_lses", is also added to keep track of the LSEs that have been set. The objective is to avoid defining a new FLOW_DISSECTOR_KEY_MPLS_XX for each level of the MPLS stack. TC flower is adapted for the new struct flow_dissector_key_mpls layout. Matching on several MPLS Label Stack Entries will be added in the next patch. The NFP and MLX5 drivers are also adapted: nfp_flower_compile_mac() and mlx5's parse_tunnel() now verify that the rule only uses the first LSE and fail if it doesn't. Finally, the behaviour of the FLOW_DISSECTOR_KEY_MPLS_ENTROPY key is slightly modified. Instead of recording the first Entropy Label, it now records the last one. This shouldn't have any consequences since there doesn't seem to be any user of FLOW_DISSECTOR_KEY_MPLS_ENTROPY in the tree. We'd probably be better off hashing all parsed MPLS labels instead (excluding reserved labels) anyway. That'd give better entropy and would probably also simplify the code. But that's not the purpose of this patch, so I'm keeping that as a possible future improvement. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller
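The parsing loop itself is easy to model outside the kernel: each 32-bit LSE is split into label/TC/BOS/TTL using the standard RFC 3032 field layout, entries are recorded up to FLOW_DIS_MPLS_MAX, and the walk stops once the Bottom Of Stack bit is seen. A stand-alone sketch under those assumptions; the struct and helper names are illustrative, not the kernel's, and byte-order handling is omitted:

#include <stdint.h>
#include <stdio.h>

#define FLOW_DIS_MPLS_MAX 7

/* Standard MPLS LSE layout (RFC 3032) */
#define MPLS_LS_LABEL_SHIFT 12
#define MPLS_LS_TC_SHIFT    9
#define MPLS_LS_S_SHIFT     8
#define MPLS_LS_TTL_SHIFT   0

struct mpls_lse {
	uint32_t label;
	uint8_t tc, bos, ttl;
};

/* Walk the label stack like the reworked __skb_flow_dissect_mpls():
 * record up to FLOW_DIS_MPLS_MAX entries and stop once BOS is seen.
 * Returns the number of LSEs recorded and sets one used_lses bit per
 * entry, mirroring dissector_set_mpls_lse().
 */
static int dissect_mpls(const uint32_t *stack, int n,
			struct mpls_lse *ls, uint8_t *used_lses)
{
	int i;

	*used_lses = 0;
	for (i = 0; i < n && i < FLOW_DIS_MPLS_MAX; i++) {
		uint32_t entry = stack[i];	/* assume host byte order here */

		ls[i].label = entry >> MPLS_LS_LABEL_SHIFT;
		ls[i].tc = (entry >> MPLS_LS_TC_SHIFT) & 0x7;
		ls[i].bos = (entry >> MPLS_LS_S_SHIFT) & 0x1;
		ls[i].ttl = (entry >> MPLS_LS_TTL_SHIFT) & 0xff;
		*used_lses |= 1 << i;
		if (ls[i].bos)	/* bottom of stack: stop, like RET_OUT_GOOD */
			return i + 1;
	}
	return i;	/* no BOS yet: the kernel would keep dissecting */
}

int main(void)
{
	/* label 100 (no BOS) with TTL 64, then label 200 with BOS set */
	uint32_t stack[] = { (100u << 12) | 64, (200u << 12) | (1u << 8) | 64 };
	struct mpls_lse ls[FLOW_DIS_MPLS_MAX];
	uint8_t used;
	int n = dissect_mpls(stack, 2, ls, &used);

	printf("parsed %d LSEs, used_lses=0x%x, top label=%u, bos[1]=%u\n",
	       n, used, ls[0].label, ls[1].bos);
	return 0;
}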
--- .../mellanox/mlx5/core/en/tc_tun_mplsoudp.c | 27 +++++++---- drivers/net/ethernet/netronome/nfp/flower/match.c | 42 ++++++++++++----- include/net/flow_dissector.h | 14 +++++- net/core/flow_dissector.c | 49 +++++++++++++------- net/sched/cls_flower.c | 52 +++++++++++++++------- 5 files changed, 132 insertions(+), 52 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c index 98ee62e427d2..b4a3c96d34fd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c @@ -101,25 +101,36 @@ static int parse_tunnel(struct mlx5e_priv *priv, flow_rule_match_mpls(rule, &match); + /* Only support matching the first LSE */ + if (match.mask->used_lses != 1) + return -EOPNOTSUPP; + MLX5_SET(fte_match_set_misc2, misc2_c, - outer_first_mpls_over_udp.mpls_label, match.mask->mpls_label); + outer_first_mpls_over_udp.mpls_label, + match.mask->ls[0].mpls_label); MLX5_SET(fte_match_set_misc2, misc2_v, - outer_first_mpls_over_udp.mpls_label, match.key->mpls_label); + outer_first_mpls_over_udp.mpls_label, + match.key->ls[0].mpls_label); MLX5_SET(fte_match_set_misc2, misc2_c, - outer_first_mpls_over_udp.mpls_exp, match.mask->mpls_tc); + outer_first_mpls_over_udp.mpls_exp, + match.mask->ls[0].mpls_tc); MLX5_SET(fte_match_set_misc2, misc2_v, - outer_first_mpls_over_udp.mpls_exp, match.key->mpls_tc); + outer_first_mpls_over_udp.mpls_exp, match.key->ls[0].mpls_tc); MLX5_SET(fte_match_set_misc2, misc2_c, - outer_first_mpls_over_udp.mpls_s_bos, match.mask->mpls_bos); + outer_first_mpls_over_udp.mpls_s_bos, + match.mask->ls[0].mpls_bos); MLX5_SET(fte_match_set_misc2, misc2_v, - outer_first_mpls_over_udp.mpls_s_bos, match.key->mpls_bos); + outer_first_mpls_over_udp.mpls_s_bos, + match.key->ls[0].mpls_bos); MLX5_SET(fte_match_set_misc2, misc2_c, - outer_first_mpls_over_udp.mpls_ttl, match.mask->mpls_ttl); + outer_first_mpls_over_udp.mpls_ttl, + match.mask->ls[0].mpls_ttl); MLX5_SET(fte_match_set_misc2, misc2_v, - outer_first_mpls_over_udp.mpls_ttl, match.key->mpls_ttl); + outer_first_mpls_over_udp.mpls_ttl, + match.key->ls[0].mpls_ttl); spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_2; return 0; diff --git a/drivers/net/ethernet/netronome/nfp/flower/match.c b/drivers/net/ethernet/netronome/nfp/flower/match.c index 546bc01d507d..f7f01e2e3dce 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/match.c +++ b/drivers/net/ethernet/netronome/nfp/flower/match.c @@ -74,9 +74,10 @@ nfp_flower_compile_port(struct nfp_flower_in_port *frame, u32 cmsg_port, return 0; } -static void +static int nfp_flower_compile_mac(struct nfp_flower_mac_mpls *ext, - struct nfp_flower_mac_mpls *msk, struct flow_rule *rule) + struct nfp_flower_mac_mpls *msk, struct flow_rule *rule, + struct netlink_ext_ack *extack) { memset(ext, 0, sizeof(struct nfp_flower_mac_mpls)); memset(msk, 0, sizeof(struct nfp_flower_mac_mpls)); @@ -97,14 +98,28 @@ nfp_flower_compile_mac(struct nfp_flower_mac_mpls *ext, u32 t_mpls; flow_rule_match_mpls(rule, &match); + + /* Only support matching the first LSE */ + if (match.mask->used_lses != 1) { + NL_SET_ERR_MSG_MOD(extack, + "unsupported offload: invalid LSE depth for MPLS match offload"); + return -EOPNOTSUPP; + } + +
t_mpls = FIELD_PREP(NFP_FLOWER_MASK_MPLS_LB, + match.key->ls[0].mpls_label) | + FIELD_PREP(NFP_FLOWER_MASK_MPLS_TC, + match.key->ls[0].mpls_tc) | + FIELD_PREP(NFP_FLOWER_MASK_MPLS_BOS, + match.key->ls[0].mpls_bos) | NFP_FLOWER_MASK_MPLS_Q; ext->mpls_lse = cpu_to_be32(t_mpls); - t_mpls = FIELD_PREP(NFP_FLOWER_MASK_MPLS_LB, match.mask->mpls_label) | - FIELD_PREP(NFP_FLOWER_MASK_MPLS_TC, match.mask->mpls_tc) | - FIELD_PREP(NFP_FLOWER_MASK_MPLS_BOS, match.mask->mpls_bos) | + t_mpls = FIELD_PREP(NFP_FLOWER_MASK_MPLS_LB, + match.mask->ls[0].mpls_label) | + FIELD_PREP(NFP_FLOWER_MASK_MPLS_TC, + match.mask->ls[0].mpls_tc) | + FIELD_PREP(NFP_FLOWER_MASK_MPLS_BOS, + match.mask->ls[0].mpls_bos) | NFP_FLOWER_MASK_MPLS_Q; msk->mpls_lse = cpu_to_be32(t_mpls); } else if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_BASIC)) { @@ -121,6 +136,8 @@ nfp_flower_compile_mac(struct nfp_flower_mac_mpls *ext, msk->mpls_lse = cpu_to_be32(NFP_FLOWER_MASK_MPLS_Q); } } + + return 0; } static void @@ -461,9 +478,12 @@ int nfp_flower_compile_flow_match(struct nfp_app *app, msk += sizeof(struct nfp_flower_in_port); if (NFP_FLOWER_LAYER_MAC & key_ls->key_layer) { - nfp_flower_compile_mac((struct nfp_flower_mac_mpls *)ext, - (struct nfp_flower_mac_mpls *)msk, - rule); + err = nfp_flower_compile_mac((struct nfp_flower_mac_mpls *)ext, + (struct nfp_flower_mac_mpls *)msk, + rule, extack); + if (err) + return err; + ext += sizeof(struct nfp_flower_mac_mpls); msk += sizeof(struct nfp_flower_mac_mpls); } diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 628383915827..4fb1a69c6ecf 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -59,13 +59,25 @@ struct flow_dissector_key_vlan { __be16 vlan_tpid; }; -struct flow_dissector_key_mpls { +struct flow_dissector_mpls_lse { u32 mpls_ttl:8, mpls_bos:1, mpls_tc:3, mpls_label:20; }; +#define FLOW_DIS_MPLS_MAX 7 +struct flow_dissector_key_mpls { + struct flow_dissector_mpls_lse ls[FLOW_DIS_MPLS_MAX]; /* Label Stack */ + u8 used_lses; /* One bit set for each Label Stack Entry in use */ +}; + +static inline void dissector_set_mpls_lse(struct flow_dissector_key_mpls *mpls, + int lse_index) +{ + mpls->used_lses |= 1 << lse_index; +} + #define FLOW_DIS_TUN_OPTS_MAX 255 /** * struct flow_dissector_key_enc_opts: diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 5dceed467f64..0aeb33572feb 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -480,47 +480,59 @@ EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, int nhoff, int hlen) + void *target_container, void *data, int nhoff, int hlen, + int lse_index, bool *entropy_label) { - struct flow_dissector_key_keyid *key_keyid; - struct mpls_label *hdr, _hdr[2]; - u32 entry, label; + struct mpls_label *hdr, _hdr; + u32 entry, label, bos; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) return FLOW_DISSECT_RET_OUT_GOOD; + if (lse_index >= FLOW_DIS_MPLS_MAX) + return FLOW_DISSECT_RET_OUT_GOOD; + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; - entry = ntohl(hdr[0].entry); + entry = ntohl(hdr->entry); label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT; + bos = (entry & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT; if (dissector_uses_key(flow_dissector, 
FLOW_DISSECTOR_KEY_MPLS)) { struct flow_dissector_key_mpls *key_mpls; + struct flow_dissector_mpls_lse *lse; key_mpls = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_MPLS, target_container); - key_mpls->mpls_label = label; - key_mpls->mpls_ttl = (entry & MPLS_LS_TTL_MASK) - >> MPLS_LS_TTL_SHIFT; - key_mpls->mpls_tc = (entry & MPLS_LS_TC_MASK) - >> MPLS_LS_TC_SHIFT; - key_mpls->mpls_bos = (entry & MPLS_LS_S_MASK) - >> MPLS_LS_S_SHIFT; + lse = &key_mpls->ls[lse_index]; + + lse->mpls_ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; + lse->mpls_bos = bos; + lse->mpls_tc = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT; + lse->mpls_label = label; + dissector_set_mpls_lse(key_mpls, lse_index); } - if (label == MPLS_LABEL_ENTROPY) { + if (*entropy_label && + dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { + struct flow_dissector_key_keyid *key_keyid; + key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY, target_container); - key_keyid->keyid = hdr[1].entry & htonl(MPLS_LS_LABEL_MASK); + key_keyid->keyid = cpu_to_be32(label); } - return FLOW_DISSECT_RET_OUT_GOOD; + + *entropy_label = label == MPLS_LABEL_ENTROPY; + + return bos ? FLOW_DISSECT_RET_OUT_GOOD : FLOW_DISSECT_RET_PROTO_AGAIN; } static enum flow_dissect_ret @@ -979,6 +991,8 @@ bool __skb_flow_dissect(const struct net *net, struct bpf_prog *attached = NULL; enum flow_dissect_ret fdret; enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; + bool mpls_el = false; + int mpls_lse = 0; int num_hdrs = 0; u8 ip_proto = 0; bool ret; @@ -1278,7 +1292,10 @@ proto_again: case htons(ETH_P_MPLS_MC): fdret = __skb_flow_dissect_mpls(skb, flow_dissector, target_container, data, - nhoff, hlen); + nhoff, hlen, mpls_lse, + &mpls_el); + nhoff += sizeof(struct mpls_label); + mpls_lse++; break; case htons(ETH_P_FCOE): if ((hlen - nhoff) < FCOE_HEADER_LEN) { diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 0c574700da75..f524afe0b7f5 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -781,9 +781,17 @@ static int fl_set_key_mpls(struct nlattr **tb, struct flow_dissector_key_mpls *key_mask, struct netlink_ext_ack *extack) { + struct flow_dissector_mpls_lse *lse_mask; + struct flow_dissector_mpls_lse *lse_val; + + lse_val = &key_val->ls[0]; + lse_mask = &key_mask->ls[0]; + if (tb[TCA_FLOWER_KEY_MPLS_TTL]) { - key_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TTL]); - key_mask->mpls_ttl = MPLS_TTL_MASK; + lse_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TTL]); + lse_mask->mpls_ttl = MPLS_TTL_MASK; + dissector_set_mpls_lse(key_val, 0); + dissector_set_mpls_lse(key_mask, 0); } if (tb[TCA_FLOWER_KEY_MPLS_BOS]) { u8 bos = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_BOS]); @@ -794,8 +802,10 @@ static int fl_set_key_mpls(struct nlattr **tb, "Bottom Of Stack (BOS) must be 0 or 1"); return -EINVAL; } - key_val->mpls_bos = bos; - key_mask->mpls_bos = MPLS_BOS_MASK; + lse_val->mpls_bos = bos; + lse_mask->mpls_bos = MPLS_BOS_MASK; + dissector_set_mpls_lse(key_val, 0); + dissector_set_mpls_lse(key_mask, 0); } if (tb[TCA_FLOWER_KEY_MPLS_TC]) { u8 tc = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TC]); @@ -806,8 +816,10 @@ static int fl_set_key_mpls(struct nlattr **tb, "Traffic Class (TC) must be between 0 and 7"); return -EINVAL; } - key_val->mpls_tc = tc; - key_mask->mpls_tc = MPLS_TC_MASK; + lse_val->mpls_tc = tc; + lse_mask->mpls_tc = MPLS_TC_MASK; + dissector_set_mpls_lse(key_val, 0); + dissector_set_mpls_lse(key_mask, 0); } if (tb[TCA_FLOWER_KEY_MPLS_LABEL]) { 
u32 label = nla_get_u32(tb[TCA_FLOWER_KEY_MPLS_LABEL]); @@ -818,8 +830,10 @@ static int fl_set_key_mpls(struct nlattr **tb, "Label must be between 0 and 1048575"); return -EINVAL; } - key_val->mpls_label = label; - key_mask->mpls_label = MPLS_LABEL_MASK; + lse_val->mpls_label = label; + lse_mask->mpls_label = MPLS_LABEL_MASK; + dissector_set_mpls_lse(key_val, 0); + dissector_set_mpls_lse(key_mask, 0); } return 0; } @@ -2222,31 +2236,37 @@ static int fl_dump_key_mpls(struct sk_buff *skb, struct flow_dissector_key_mpls *mpls_key, struct flow_dissector_key_mpls *mpls_mask) { + struct flow_dissector_mpls_lse *lse_mask; + struct flow_dissector_mpls_lse *lse_key; int err; if (!memchr_inv(mpls_mask, 0, sizeof(*mpls_mask))) return 0; - if (mpls_mask->mpls_ttl) { + + lse_mask = &mpls_mask->ls[0]; + lse_key = &mpls_key->ls[0]; + + if (lse_mask->mpls_ttl) { err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_TTL, - mpls_key->mpls_ttl); + lse_key->mpls_ttl); if (err) return err; } - if (mpls_mask->mpls_tc) { + if (lse_mask->mpls_tc) { err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_TC, - mpls_key->mpls_tc); + lse_key->mpls_tc); if (err) return err; } - if (mpls_mask->mpls_label) { + if (lse_mask->mpls_label) { err = nla_put_u32(skb, TCA_FLOWER_KEY_MPLS_LABEL, - mpls_key->mpls_label); + lse_key->mpls_label); if (err) return err; } - if (mpls_mask->mpls_bos) { + if (lse_mask->mpls_bos) { err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_BOS, - mpls_key->mpls_bos); + lse_key->mpls_bos); if (err) return err; } -- cgit v1.2.3 From 61aec25a6db5d0c2e8ab5da6d2d152269d0d9d69 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 26 May 2020 14:29:04 +0200 Subject: cls_flower: Support filtering on multiple MPLS Label Stack Entries With struct flow_dissector_key_mpls now recording the first FLOW_DIS_MPLS_MAX labels, we can extend Flower to filter on any of these LSEs independently. In order to avoid creating new netlink attributes for every possible depth, let's define a new TCA_FLOWER_KEY_MPLS_OPTS nested attribute that contains the list of LSEs to match. Each LSE is represented by another attribute, TCA_FLOWER_KEY_MPLS_OPTS_LSE, which then contains the attributes representing the depth and the MPLS fields to match at this depth (label, TTL, etc.). For each MPLS field, the mask is always set to all-ones, as this is what the original API did. We could allow user configurable masks in the future if there is demand for more flexibility. The new API also allows to only specify an LSE depth. In that case, Flower only verifies that the MPLS label stack depth is greater or equal to the provided depth (that is, an LSE exists at this depth). Filters that only match on one (or more) fields of the first LSE are dumped using the old netlink attributes, to avoid confusing user space programs that don't understand the new API. Signed-off-by: Guillaume Nault Signed-off-by: David S. 
Miller --- include/uapi/linux/pkt_cls.h | 23 ++++ net/sched/cls_flower.c | 243 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 265 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index fc672b232437..7576209d96f9 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -576,6 +576,8 @@ enum { TCA_FLOWER_KEY_CT_LABELS, /* u128 */ TCA_FLOWER_KEY_CT_LABELS_MASK, /* u128 */ + TCA_FLOWER_KEY_MPLS_OPTS, + __TCA_FLOWER_MAX, }; @@ -640,6 +642,27 @@ enum { #define TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX \ (__TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX - 1) +enum { + TCA_FLOWER_KEY_MPLS_OPTS_UNSPEC, + TCA_FLOWER_KEY_MPLS_OPTS_LSE, + __TCA_FLOWER_KEY_MPLS_OPTS_MAX, +}; + +#define TCA_FLOWER_KEY_MPLS_OPTS_MAX (__TCA_FLOWER_KEY_MPLS_OPTS_MAX - 1) + +enum { + TCA_FLOWER_KEY_MPLS_OPT_LSE_UNSPEC, + TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH, + TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL, + TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS, + TCA_FLOWER_KEY_MPLS_OPT_LSE_TC, + TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL, + __TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX, +}; + +#define TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX \ + (__TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX - 1) + enum { TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0), TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index f524afe0b7f5..96f5999281e0 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -668,6 +668,7 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_MPLS_BOS] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_MPLS_TC] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_MPLS_LABEL] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_MPLS_OPTS] = { .type = NLA_NESTED }, [TCA_FLOWER_KEY_TCP_FLAGS] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_TCP_FLAGS_MASK] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_IP_TOS] = { .type = NLA_U8 }, @@ -726,6 +727,20 @@ erspan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1] = { [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID] = { .type = NLA_U8 }, }; +static const struct nla_policy +mpls_opts_policy[TCA_FLOWER_KEY_MPLS_OPTS_MAX + 1] = { + [TCA_FLOWER_KEY_MPLS_OPTS_LSE] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy +mpls_stack_entry_policy[TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX + 1] = { + [TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_MPLS_OPT_LSE_TC] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL] = { .type = NLA_U32 }, +}; + static void fl_set_key_val(struct nlattr **tb, void *val, int val_type, void *mask, int mask_type, int len) @@ -776,6 +791,126 @@ static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key, return 0; } +static int fl_set_key_mpls_lse(const struct nlattr *nla_lse, + struct flow_dissector_key_mpls *key_val, + struct flow_dissector_key_mpls *key_mask, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX + 1]; + struct flow_dissector_mpls_lse *lse_mask; + struct flow_dissector_mpls_lse *lse_val; + u8 lse_index; + u8 depth; + int err; + + err = nla_parse_nested(tb, TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX, nla_lse, + mpls_stack_entry_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH]) { + NL_SET_ERR_MSG(extack, "Missing MPLS option \"depth\""); + return -EINVAL; + } + + depth = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH]); + + /* LSE depth starts at 1, for consistency with 
terminology used by + * RFC 3031 (section 3.9), where depth 0 refers to unlabeled packets. + */ + if (depth < 1 || depth > FLOW_DIS_MPLS_MAX) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH], + "Invalid MPLS depth"); + return -EINVAL; + } + lse_index = depth - 1; + + dissector_set_mpls_lse(key_val, lse_index); + dissector_set_mpls_lse(key_mask, lse_index); + + lse_val = &key_val->ls[lse_index]; + lse_mask = &key_mask->ls[lse_index]; + + if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL]) { + lse_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL]); + lse_mask->mpls_ttl = MPLS_TTL_MASK; + } + if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS]) { + u8 bos = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS]); + + if (bos & ~MPLS_BOS_MASK) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS], + "Bottom Of Stack (BOS) must be 0 or 1"); + return -EINVAL; + } + lse_val->mpls_bos = bos; + lse_mask->mpls_bos = MPLS_BOS_MASK; + } + if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TC]) { + u8 tc = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TC]); + + if (tc & ~MPLS_TC_MASK) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TC], + "Traffic Class (TC) must be between 0 and 7"); + return -EINVAL; + } + lse_val->mpls_tc = tc; + lse_mask->mpls_tc = MPLS_TC_MASK; + } + if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL]) { + u32 label = nla_get_u32(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL]); + + if (label & ~MPLS_LABEL_MASK) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL], + "Label must be between 0 and 1048575"); + return -EINVAL; + } + lse_val->mpls_label = label; + lse_mask->mpls_label = MPLS_LABEL_MASK; + } + + return 0; +} + +static int fl_set_key_mpls_opts(const struct nlattr *nla_mpls_opts, + struct flow_dissector_key_mpls *key_val, + struct flow_dissector_key_mpls *key_mask, + struct netlink_ext_ack *extack) +{ + struct nlattr *nla_lse; + int rem; + int err; + + if (!(nla_mpls_opts->nla_type & NLA_F_NESTED)) { + NL_SET_ERR_MSG_ATTR(extack, nla_mpls_opts, + "NLA_F_NESTED is missing"); + return -EINVAL; + } + + nla_for_each_nested(nla_lse, nla_mpls_opts, rem) { + if (nla_type(nla_lse) != TCA_FLOWER_KEY_MPLS_OPTS_LSE) { + NL_SET_ERR_MSG_ATTR(extack, nla_lse, + "Invalid MPLS option type"); + return -EINVAL; + } + + err = fl_set_key_mpls_lse(nla_lse, key_val, key_mask, extack); + if (err < 0) + return err; + } + if (rem) { + NL_SET_ERR_MSG(extack, + "Bytes leftover after parsing MPLS options"); + return -EINVAL; + } + + return 0; +} + static int fl_set_key_mpls(struct nlattr **tb, struct flow_dissector_key_mpls *key_val, struct flow_dissector_key_mpls *key_mask, @@ -784,6 +919,21 @@ static int fl_set_key_mpls(struct nlattr **tb, struct flow_dissector_mpls_lse *lse_mask; struct flow_dissector_mpls_lse *lse_val; + if (tb[TCA_FLOWER_KEY_MPLS_OPTS]) { + if (tb[TCA_FLOWER_KEY_MPLS_TTL] || + tb[TCA_FLOWER_KEY_MPLS_BOS] || + tb[TCA_FLOWER_KEY_MPLS_TC] || + tb[TCA_FLOWER_KEY_MPLS_LABEL]) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_OPTS], + "MPLS label, Traffic Class, Bottom Of Stack and Time To Live must be encapsulated in the MPLS options attribute"); + return -EBADMSG; + } + + return fl_set_key_mpls_opts(tb[TCA_FLOWER_KEY_MPLS_OPTS], + key_val, key_mask, extack); + } + lse_val = &key_val->ls[0]; lse_mask = &key_mask->ls[0]; @@ -2232,6 +2382,89 @@ static int fl_dump_key_port_range(struct sk_buff *skb, struct fl_flow_key *key, return 0; } +static int fl_dump_key_mpls_opt_lse(struct sk_buff *skb, + struct flow_dissector_key_mpls *mpls_key, + struct 
flow_dissector_key_mpls *mpls_mask, + u8 lse_index) +{ + struct flow_dissector_mpls_lse *lse_mask = &mpls_mask->ls[lse_index]; + struct flow_dissector_mpls_lse *lse_key = &mpls_key->ls[lse_index]; + int err; + + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH, + lse_index + 1); + if (err) + return err; + + if (lse_mask->mpls_ttl) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL, + lse_key->mpls_ttl); + if (err) + return err; + } + if (lse_mask->mpls_bos) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS, + lse_key->mpls_bos); + if (err) + return err; + } + if (lse_mask->mpls_tc) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_TC, + lse_key->mpls_tc); + if (err) + return err; + } + if (lse_mask->mpls_label) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL, + lse_key->mpls_label); + if (err) + return err; + } + + return 0; +} + +static int fl_dump_key_mpls_opts(struct sk_buff *skb, + struct flow_dissector_key_mpls *mpls_key, + struct flow_dissector_key_mpls *mpls_mask) +{ + struct nlattr *opts; + struct nlattr *lse; + u8 lse_index; + int err; + + opts = nla_nest_start(skb, TCA_FLOWER_KEY_MPLS_OPTS); + if (!opts) + return -EMSGSIZE; + + for (lse_index = 0; lse_index < FLOW_DIS_MPLS_MAX; lse_index++) { + if (!(mpls_mask->used_lses & 1 << lse_index)) + continue; + + lse = nla_nest_start(skb, TCA_FLOWER_KEY_MPLS_OPTS_LSE); + if (!lse) { + err = -EMSGSIZE; + goto err_opts; + } + + err = fl_dump_key_mpls_opt_lse(skb, mpls_key, mpls_mask, + lse_index); + if (err) + goto err_opts_lse; + nla_nest_end(skb, lse); + } + nla_nest_end(skb, opts); + + return 0; + +err_opts_lse: + nla_nest_cancel(skb, lse); +err_opts: + nla_nest_cancel(skb, opts); + + return err; +} + static int fl_dump_key_mpls(struct sk_buff *skb, struct flow_dissector_key_mpls *mpls_key, struct flow_dissector_key_mpls *mpls_mask) @@ -2240,12 +2473,20 @@ static int fl_dump_key_mpls(struct sk_buff *skb, struct flow_dissector_mpls_lse *lse_key; int err; - if (!memchr_inv(mpls_mask, 0, sizeof(*mpls_mask))) + if (!mpls_mask->used_lses) return 0; lse_mask = &mpls_mask->ls[0]; lse_key = &mpls_key->ls[0]; + /* For backward compatibility, don't use the MPLS nested attributes if + * the rule can be expressed using the old attributes. + */ + if (mpls_mask->used_lses & ~1 || + (!lse_mask->mpls_ttl && !lse_mask->mpls_bos && + !lse_mask->mpls_tc && !lse_mask->mpls_label)) + return fl_dump_key_mpls_opts(skb, mpls_key, mpls_mask); + if (lse_mask->mpls_ttl) { err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_TTL, lse_key->mpls_ttl); -- cgit v1.2.3 From 09d0310f07672b7e3e60cf719b96c803f4830e5c Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Mon, 25 May 2020 17:31:58 +0200 Subject: net/smc: mark smc_pnet_policy as const Netlink policies are generally declared as const. This is safer and prevents potential bugs. Signed-off-by: Dmitry Vyukov Signed-off-by: David S. 
Miller --- net/smc/smc_pnet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index be03f1260d59..014d91b9778e 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -32,7 +32,7 @@ static struct net_device *pnet_find_base_ndev(struct net_device *ndev); -static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { +static const struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { [SMC_PNETID_NAME] = { .type = NLA_NUL_STRING, .len = SMC_MAX_PNETID_LEN -- cgit v1.2.3 From 4e637c70b503b686aae45716a25a94dc3a434f3a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 25 May 2020 23:41:13 +0200 Subject: mptcp: attempt coalescing when moving skbs to mptcp rx queue We can try to coalesce skbs we take from the subflow's rx queue with the tail of the mptcp rx queue. If successful, the skb head can be discarded early. We can also free the skb extensions, as we do not access them after this. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index ba9d3d5c625f..b2c8b57e7942 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -144,12 +144,29 @@ static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, unsigned int offset, size_t copy_len) { struct sock *sk = (struct sock *)msk; + struct sk_buff *tail; __skb_unlink(skb, &ssk->sk_receive_queue); - skb_set_owner_r(skb, sk); - __skb_queue_tail(&sk->sk_receive_queue, skb); + skb_ext_reset(skb); + skb_orphan(skb); msk->ack_seq += copy_len; + + tail = skb_peek_tail(&sk->sk_receive_queue); + if (offset == 0 && tail) { + bool fragstolen; + int delta; + + if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { + kfree_skb_partial(skb, fragstolen); + atomic_add(delta, &sk->sk_rmem_alloc); + sk_mem_charge(sk, delta); + return; + } + } + + skb_set_owner_r(skb, sk); + __skb_queue_tail(&sk->sk_receive_queue, skb); MPTCP_SKB_CB(skb)->offset = offset; } -- cgit v1.2.3 From 1a644de29f712771c2ec00e52caa391544eb6141 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 27 May 2020 00:21:38 +0200 Subject: net: ethtool: Add generic parts of cable test TDR Add the generic parts of the code used to trigger a cable test and return raw TDR data. Any PHY driver which supports this must implement the new driver op. Signed-off-by: Andrew Lunn [v2: update nxp-tja11xx for API change] Signed-off-by: David S. Miller
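The generic/driver split follows the kernel's usual optional-op pattern: phy_start_cable_test_tdr() first checks that the PHY driver provides both cable_test_tdr_start and cable_test_get_status, and bails out with -EOPNOTSUPP otherwise (see the phy.c hunk below). A stand-alone model of just that capability check; the types here are hypothetical miniatures, and only the errno value is Linux's:

#include <stdio.h>

#define EOPNOTSUPP 95	/* Linux errno for "operation not supported" */

/* Hypothetical miniature of the phy_driver ops table: either op
 * may be left NULL by drivers that do not support raw TDR data.
 */
struct phy_ops {
	int (*cable_test_tdr_start)(void *phy);
	int (*cable_test_get_status)(void *phy);
};

/* Models the guard at the top of phy_start_cable_test_tdr(): both
 * ops must be present before a TDR test may be triggered.
 */
static int start_cable_test_tdr(const struct phy_ops *ops, void *phy)
{
	if (!ops->cable_test_tdr_start || !ops->cable_test_get_status)
		return -EOPNOTSUPP;
	return ops->cable_test_tdr_start(phy);
}

static int dummy_tdr_start(void *phy) { (void)phy; return 0; }
static int dummy_get_status(void *phy) { (void)phy; return 0; }

int main(void)
{
	struct phy_ops with_tdr = { dummy_tdr_start, dummy_get_status };
	struct phy_ops without_tdr = { 0 };

	printf("with op:    %d\n", start_cable_test_tdr(&with_tdr, NULL));
	printf("without op: %d\n", start_cable_test_tdr(&without_tdr, NULL));
	return 0;
}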
--- drivers/net/phy/nxp-tja11xx.c | 2 +- drivers/net/phy/phy.c | 65 ++++++++++++++++++++++++++++++++++++++++- include/linux/ethtool_netlink.h | 4 +-- include/linux/phy.h | 13 +++++++++ net/ethtool/cabletest.c | 64 ++++++++++++++++++++++++++++++++++++---- net/ethtool/netlink.c | 5 ++++ net/ethtool/netlink.h | 1 + 7 files changed, 144 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/drivers/net/phy/nxp-tja11xx.c b/drivers/net/phy/nxp-tja11xx.c index 1e79c30ca81a..a72fa0d2e7c7 100644 --- a/drivers/net/phy/nxp-tja11xx.c +++ b/drivers/net/phy/nxp-tja11xx.c @@ -194,7 +194,7 @@ static int tja11xx_config_aneg_cable_test(struct phy_device *phydev) !phydev->drv->cable_test_get_status) return 0; - ret = ethnl_cable_test_alloc(phydev); + ret = ethnl_cable_test_alloc(phydev, ETHTOOL_MSG_CABLE_TEST_NTF); if (ret) return ret; diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 27da0c94818f..495d9ba3d5bf 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -519,7 +519,7 @@ int phy_start_cable_test(struct phy_device *phydev, goto out; } - err = ethnl_cable_test_alloc(phydev); + err = ethnl_cable_test_alloc(phydev, ETHTOOL_MSG_CABLE_TEST_NTF); if (err) goto out; @@ -552,6 +552,69 @@ out: } EXPORT_SYMBOL(phy_start_cable_test); +int phy_start_cable_test_tdr(struct phy_device *phydev, + struct netlink_ext_ack *extack) +{ + struct net_device *dev = phydev->attached_dev; + int err = -ENOMEM; + + if (!(phydev->drv && + phydev->drv->cable_test_tdr_start && + phydev->drv->cable_test_get_status)) { + NL_SET_ERR_MSG(extack, + "PHY driver does not support cable test TDR"); + return -EOPNOTSUPP; + } + + mutex_lock(&phydev->lock); + if (phydev->state == PHY_CABLETEST) { + NL_SET_ERR_MSG(extack, + "PHY already performing a test"); + err = -EBUSY; + goto out; + } + + if (phydev->state < PHY_UP || + phydev->state > PHY_CABLETEST) { + NL_SET_ERR_MSG(extack, + "PHY not configured.
Try setting interface up"); + err = -EBUSY; + goto out; + } + + err = ethnl_cable_test_alloc(phydev, ETHTOOL_MSG_CABLE_TEST_TDR_NTF); + if (err) + goto out; + + /* Mark the carrier down until the test is complete */ + phy_link_down(phydev); + + netif_testing_on(dev); + err = phydev->drv->cable_test_tdr_start(phydev); + if (err) { + netif_testing_off(dev); + phy_link_up(phydev); + goto out_free; + } + + phydev->state = PHY_CABLETEST; + + if (phy_polling_mode(phydev)) + phy_trigger_machine(phydev); + + mutex_unlock(&phydev->lock); + + return 0; + +out_free: + ethnl_cable_test_free(phydev); +out: + mutex_unlock(&phydev->lock); + + return err; +} +EXPORT_SYMBOL(phy_start_cable_test_tdr); + static int phy_config_aneg(struct phy_device *phydev) { if (phydev->drv->config_aneg) diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h index e317fc99565e..24817ba252a0 100644 --- a/include/linux/ethtool_netlink.h +++ b/include/linux/ethtool_netlink.h @@ -17,13 +17,13 @@ enum ethtool_multicast_groups { struct phy_device; #if IS_ENABLED(CONFIG_ETHTOOL_NETLINK) -int ethnl_cable_test_alloc(struct phy_device *phydev); +int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd); void ethnl_cable_test_free(struct phy_device *phydev); void ethnl_cable_test_finished(struct phy_device *phydev); int ethnl_cable_test_result(struct phy_device *phydev, u8 pair, u8 result); int ethnl_cable_test_fault_length(struct phy_device *phydev, u8 pair, u32 cm); #else -static inline int ethnl_cable_test_alloc(struct phy_device *phydev) +static inline int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd) { return -EOPNOTSUPP; } diff --git a/include/linux/phy.h b/include/linux/phy.h index 6d256e720a66..d3c384f353ca 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -699,6 +699,10 @@ struct phy_driver { /* Start a cable test */ int (*cable_test_start)(struct phy_device *dev); + + /* Start a raw TDR cable test */ + int (*cable_test_tdr_start)(struct phy_device *dev); + /* Once per second, or on interrupt, request the status of the * test. 
*/ @@ -1251,6 +1255,8 @@ int phy_reset_after_clk_enable(struct phy_device *phydev); #if IS_ENABLED(CONFIG_PHYLIB) int phy_start_cable_test(struct phy_device *phydev, struct netlink_ext_ack *extack); +int phy_start_cable_test_tdr(struct phy_device *phydev, + struct netlink_ext_ack *extack); #else static inline int phy_start_cable_test(struct phy_device *phydev, @@ -1259,6 +1265,13 @@ int phy_start_cable_test(struct phy_device *phydev, NL_SET_ERR_MSG(extack, "Kernel not compiled with PHYLIB support"); return -EOPNOTSUPP; } +static inline +int phy_start_cable_test_tdr(struct phy_device *phydev, + struct netlink_ext_ack *extack) +{ + NL_SET_ERR_MSG(extack, "Kernel not compiled with PHYLIB support"); + return -EOPNOTSUPP; +} #endif int phy_cable_test_result(struct phy_device *phydev, u8 pair, u16 result); diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index 5ba06eabe8c2..94e9d5f04353 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -13,7 +13,7 @@ cable_test_act_policy[ETHTOOL_A_CABLE_TEST_MAX + 1] = { [ETHTOOL_A_CABLE_TEST_HEADER] = { .type = NLA_NESTED }, }; -static int ethnl_cable_test_started(struct phy_device *phydev) +static int ethnl_cable_test_started(struct phy_device *phydev, u8 cmd) { struct sk_buff *skb; int err = -ENOMEM; @@ -23,7 +23,7 @@ static int ethnl_cable_test_started(struct phy_device *phydev) if (!skb) goto out; - ehdr = ethnl_bcastmsg_put(skb, ETHTOOL_MSG_CABLE_TEST_NTF); + ehdr = ethnl_bcastmsg_put(skb, cmd); if (!ehdr) { err = -EMSGSIZE; goto out; @@ -86,7 +86,8 @@ int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info) ethnl_ops_complete(dev); if (!ret) - ethnl_cable_test_started(dev->phydev); + ethnl_cable_test_started(dev->phydev, + ETHTOOL_MSG_CABLE_TEST_NTF); out_rtnl: rtnl_unlock(); @@ -95,7 +96,7 @@ out_dev_put: return ret; } -int ethnl_cable_test_alloc(struct phy_device *phydev) +int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd) { int err = -ENOMEM; @@ -103,8 +104,7 @@ int ethnl_cable_test_alloc(struct phy_device *phydev) if (!phydev->skb) goto out; - phydev->ehdr = ethnl_bcastmsg_put(phydev->skb, - ETHTOOL_MSG_CABLE_TEST_NTF); + phydev->ehdr = ethnl_bcastmsg_put(phydev->skb, cmd); if (!phydev->ehdr) { err = -EMSGSIZE; goto out; @@ -199,3 +199,55 @@ err: return ret; } EXPORT_SYMBOL_GPL(ethnl_cable_test_fault_length); + +static const struct nla_policy +cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_MAX + 1] = { + [ETHTOOL_A_CABLE_TEST_TDR_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_CABLE_TEST_TDR_HEADER] = { .type = NLA_NESTED }, +}; + +int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[ETHTOOL_A_CABLE_TEST_TDR_MAX + 1]; + struct ethnl_req_info req_info = {}; + struct net_device *dev; + int ret; + + ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, + ETHTOOL_A_CABLE_TEST_TDR_MAX, + cable_test_tdr_act_policy, info->extack); + if (ret < 0) + return ret; + + ret = ethnl_parse_header_dev_get(&req_info, + tb[ETHTOOL_A_CABLE_TEST_TDR_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + + dev = req_info.dev; + if (!dev->phydev) { + ret = -EOPNOTSUPP; + goto out_dev_put; + } + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + + ret = phy_start_cable_test_tdr(dev->phydev, info->extack); + + ethnl_ops_complete(dev); + + if (!ret) + ethnl_cable_test_started(dev->phydev, + ETHTOOL_MSG_CABLE_TEST_TDR_NTF); + +out_rtnl: + rtnl_unlock(); +out_dev_put: + dev_put(dev); + return ret; +} diff --git 
a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 0f2f4754dcf9..88fd07f47040 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -844,6 +844,11 @@ static const struct genl_ops ethtool_genl_ops[] = { .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_act_cable_test, }, + { + .cmd = ETHTOOL_MSG_CABLE_TEST_TDR_ACT, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_act_cable_test_tdr, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index b0eb5d920099..9a96b6e90dc2 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -360,5 +360,6 @@ int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info); int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info); int ethnl_set_eee(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info); +int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info); #endif /* _NET_ETHTOOL_NETLINK_H */ -- cgit v1.2.3 From 6b4a0fc106521e480c00b55a7ef38c89f02dc4e8 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 27 May 2020 00:21:39 +0200 Subject: net: ethtool: Add helpers for cable test TDR data Add helpers for returning raw TDR helpers in netlink messages. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/ethtool_netlink.h | 21 +++++++++++ net/ethtool/cabletest.c | 80 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 100 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h index 24817ba252a0..8fbe4f97ffad 100644 --- a/include/linux/ethtool_netlink.h +++ b/include/linux/ethtool_netlink.h @@ -22,6 +22,10 @@ void ethnl_cable_test_free(struct phy_device *phydev); void ethnl_cable_test_finished(struct phy_device *phydev); int ethnl_cable_test_result(struct phy_device *phydev, u8 pair, u8 result); int ethnl_cable_test_fault_length(struct phy_device *phydev, u8 pair, u32 cm); +int ethnl_cable_test_amplitude(struct phy_device *phydev, u8 pair, s16 mV); +int ethnl_cable_test_pulse(struct phy_device *phydev, u16 mV); +int ethnl_cable_test_step(struct phy_device *phydev, u32 first, u32 last, + u32 step); #else static inline int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd) { @@ -46,5 +50,22 @@ static inline int ethnl_cable_test_fault_length(struct phy_device *phydev, { return -EOPNOTSUPP; } + +static inline int ethnl_cable_test_amplitude(struct phy_device *phydev, + u8 pair, s16 mV) +{ + return -EOPNOTSUPP; +} + +static inline int ethnl_cable_test_pulse(struct phy_device *phydev, u16 mV) +{ + return -EOPNOTSUPP; +} + +static inline int ethnl_cable_test_step(struct phy_device *phydev, u32 first, + u32 last, u32 step) +{ + return -EOPNOTSUPP; +} #endif /* IS_ENABLED(ETHTOOL_NETLINK) */ #endif /* _LINUX_ETHTOOL_NETLINK_H_ */ diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index 94e9d5f04353..390d0673ff01 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -100,7 +100,10 @@ int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd) { int err = -ENOMEM; - phydev->skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + /* One TDR sample occupies 20 bytes. For a 150 meter cable, + * with four pairs, around 12K is needed. 
+	 */
+	phydev->skb = genlmsg_new(SZ_16K, GFP_KERNEL);
 	if (!phydev->skb)
 		goto out;
@@ -251,3 +254,78 @@ out_dev_put:
 	dev_put(dev);
 	return ret;
 }
+
+int ethnl_cable_test_amplitude(struct phy_device *phydev,
+			       u8 pair, s16 mV)
+{
+	struct nlattr *nest;
+	int ret = -EMSGSIZE;
+
+	nest = nla_nest_start(phydev->skb,
+			      ETHTOOL_A_CABLE_TDR_NEST_AMPLITUDE);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_AMPLITUDE_PAIR, pair))
+		goto err;
+	if (nla_put_u16(phydev->skb, ETHTOOL_A_CABLE_AMPLITUDE_mV, mV))
+		goto err;
+
+	nla_nest_end(phydev->skb, nest);
+	return 0;
+
+err:
+	nla_nest_cancel(phydev->skb, nest);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ethnl_cable_test_amplitude);
+
+int ethnl_cable_test_pulse(struct phy_device *phydev, u16 mV)
+{
+	struct nlattr *nest;
+	int ret = -EMSGSIZE;
+
+	nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_TDR_NEST_PULSE);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (nla_put_u16(phydev->skb, ETHTOOL_A_CABLE_PULSE_mV, mV))
+		goto err;
+
+	nla_nest_end(phydev->skb, nest);
+	return 0;
+
+err:
+	nla_nest_cancel(phydev->skb, nest);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ethnl_cable_test_pulse);
+
+int ethnl_cable_test_step(struct phy_device *phydev, u32 first, u32 last,
+			  u32 step)
+{
+	struct nlattr *nest;
+	int ret = -EMSGSIZE;
+
+	nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_TDR_NEST_STEP);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_STEP_FIRST_DISTANCE,
+			first))
+		goto err;
+
+	if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_STEP_LAST_DISTANCE, last))
+		goto err;
+
+	if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_STEP_STEP_DISTANCE, step))
+		goto err;
+
+	nla_nest_end(phydev->skb, nest);
+	return 0;
+
+err:
+	nla_nest_cancel(phydev->skb, nest);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ethnl_cable_test_step);
--
cgit v1.2.3

From f2bc8ad31a7f814237bc6301d59296d76505a688 Mon Sep 17 00:00:00 2001
From: Andrew Lunn
Date: Wed, 27 May 2020 00:21:41 +0200
Subject: net: ethtool: Allow PHY cable test TDR data to be configured

Allow the user to configure where on the cable the TDR data should be
retrieved, in terms of first and last sample, and the step between
samples. Also add the ability to ask for TDR data for just one pair.

If this configuration is not provided, it defaults to 1-150m at 1m
intervals for all pairs.

Signed-off-by: Andrew Lunn

v3:
 - Move the TDR configuration into a structure
 - Add a range check on step
 - Use NL_SET_ERR_MSG_ATTR() when appropriate
 - Move TDR configuration into a nest
 - Document attributes in the request

Signed-off-by: David S. Miller
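[Editor's note: for orientation, here is a minimal userspace sketch, not
part of the patch, of how such a TDR request could be built with libmnl.
The ethtool genetlink family id is assumed to have been resolved
beforehand (family_id is a placeholder), error handling is omitted, and
all distances are in centimeters, matching the uapi attributes this
patch adds.]

	#include <stdint.h>
	#include <linux/ethtool_netlink.h>
	#include <linux/genetlink.h>
	#include <libmnl/libmnl.h>

	/* Build an ETHTOOL_MSG_CABLE_TEST_TDR_ACT request asking for TDR
	 * data between 1m and 20m, sampled every 1m, on pair A only.
	 */
	static struct nlmsghdr *build_tdr_request(void *buf, uint16_t family_id,
						  const char *ifname)
	{
		struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
		struct genlmsghdr *genl;
		struct nlattr *header, *cfg;

		nlh->nlmsg_type = family_id;	/* resolved "ethtool" family */
		nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;

		genl = mnl_nlmsg_put_extra_header(nlh, sizeof(*genl));
		genl->cmd = ETHTOOL_MSG_CABLE_TEST_TDR_ACT;
		genl->version = ETHTOOL_GENL_VERSION;

		/* Select the device to test */
		header = mnl_attr_nest_start(nlh, ETHTOOL_A_CABLE_TEST_TDR_HEADER);
		mnl_attr_put_strz(nlh, ETHTOOL_A_HEADER_DEV_NAME, ifname);
		mnl_attr_nest_end(nlh, header);

		/* Optional configuration nest; distances in centimeters */
		cfg = mnl_attr_nest_start(nlh, ETHTOOL_A_CABLE_TEST_TDR_CFG);
		mnl_attr_put_u32(nlh, ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST, 100);
		mnl_attr_put_u32(nlh, ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST, 2000);
		mnl_attr_put_u32(nlh, ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP, 100);
		mnl_attr_put_u8(nlh, ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR,
				ETHTOOL_A_CABLE_PAIR_A);
		mnl_attr_nest_end(nlh, cfg);

		return nlh;
	}

Omitting the ETHTOOL_A_CABLE_TEST_TDR_CFG nest, or any of its members,
leaves the corresponding parameter at the kernel defaults described in
the commit message.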
---
 Documentation/networking/ethtool-netlink.rst | 22 +++++-
 drivers/net/phy/marvell.c | 59 ++++++++++-----
 drivers/net/phy/phy.c | 5 +-
 include/linux/phy.h | 21 +++++-
 include/uapi/linux/ethtool_netlink.h | 13 ++++
 net/ethtool/cabletest.c | 104 ++++++++++++++++++++++++++-
 6 files changed, 197 insertions(+), 27 deletions(-)

(limited to 'net')

diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index dae36227d590..d42661b91128 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -1023,9 +1023,25 @@ Start a cable test and report raw TDR data

 Request contents:

- ==================================== ====== ==========================
- ``ETHTOOL_A_CABLE_TEST_TDR_HEADER``  nested  request header
- ==================================== ====== ==========================
+ +--------------------------------------------+--------+-----------------------+
+ | ``ETHTOOL_A_CABLE_TEST_TDR_HEADER``        | nested | reply header          |
+ +--------------------------------------------+--------+-----------------------+
+ | ``ETHTOOL_A_CABLE_TEST_TDR_CFG``           | nested | test configuration    |
+ +-+------------------------------------------+--------+-----------------------+
+ | | ``ETHTOOL_A_CABLE_STEP_FIRST_DISTANCE``  | u32    | first data distance   |
+ +-+-+----------------------------------------+--------+-----------------------+
+ | | ``ETHTOOL_A_CABLE_STEP_LAST_DISTANCE``   | u32    | last data distance    |
+ +-+-+----------------------------------------+--------+-----------------------+
+ | | ``ETHTOOL_A_CABLE_STEP_STEP_DISTANCE``   | u32    | distance of each step |
+ +-+-+----------------------------------------+--------+-----------------------+
+ | | ``ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR``    | u8     | pair to test          |
+ +-+-+----------------------------------------+--------+-----------------------+
+
+The ETHTOOL_A_CABLE_TEST_TDR_CFG nest is optional, as is each of its
+members. All distances are expressed in centimeters. The PHY takes
+the distances as a guide, and rounds to the nearest distance it
+actually supports. If a pair is passed, only that one pair will be
+tested. Otherwise all pairs are tested.
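[Editor's note: the driver side of this contract is small. A
cable_test_tdr_start() implementation receives the struct phy_tdr_config
added by this patch, adjusts the requested window to what the hardware
can do, and reports the pulse and step it actually used via the ethnl_*
helpers from the previous patch. A hedged sketch for a hypothetical
"foo" PHY driver follows; the foo_* name and the 1m granularity are
assumptions, distances are in centimeters.]

	#include <linux/kernel.h>
	#include <linux/ethtool_netlink.h>
	#include <linux/phy.h>

	static int foo_cable_test_tdr_start(struct phy_device *phydev,
					    const struct phy_tdr_config *cfg)
	{
		/* Round to whole meters within the 1m..150m this
		 * hypothetical hardware supports.
		 */
		u32 first = clamp_t(u32, rounddown(cfg->first, 100), 100, 15000);
		u32 last = clamp_t(u32, roundup(cfg->last, 100), first, 15000);
		u32 step = max_t(u32, rounddown(cfg->step, 100), 100);
		int err;

		/* cfg->pair is either PHY_PAIR_ALL or one pair index;
		 * pair selection is applied when sampling, not here.
		 */

		/* Report the amplitude of the transmitted pulse in mV */
		err = ethnl_cable_test_pulse(phydev, 1000);
		if (err)
			return err;

		/* Report the distance range and step size actually used */
		return ethnl_cable_test_step(phydev, first, last, step);
	}

The per-distance amplitudes themselves are then reported from the
driver's cable_test_get_status() callback via
ethnl_cable_test_amplitude(), as the Marvell driver below does.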
Notification contents: diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index e597bee2e966..335e51d6f138 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -198,6 +198,7 @@ #define MII_VCT5_CTRL_PEEK_HYST_DEFAULT 3 #define MII_VCT5_SAMPLE_POINT_DISTANCE 0x18 +#define MII_VCT5_SAMPLE_POINT_DISTANCE_MAX 511 #define MII_VCT5_TX_PULSE_CTRL 0x1c #define MII_VCT5_TX_PULSE_CTRL_DONT_WAIT_LINK_DOWN BIT(12) #define MII_VCT5_TX_PULSE_CTRL_PULSE_WIDTH_128nS (0x0 << 10) @@ -270,6 +271,10 @@ struct marvell_priv { char *hwmon_name; struct device *hwmon_dev; bool cable_test_tdr; + u32 first; + u32 last; + u32 step; + s8 pair; }; static int marvell_read_page(struct phy_device *phydev) @@ -1787,12 +1792,18 @@ static u32 marvell_vct5_distance2cm(int distance) return distance * 805 / 10; } +static u32 marvell_vct5_cm2distance(int cm) +{ + return cm * 10 / 805; +} + static int marvell_vct5_amplitude_distance(struct phy_device *phydev, - int distance) + int distance, int pair) { - int mV_pair0, mV_pair1, mV_pair2, mV_pair3; u16 reg; int err; + int mV; + int i; err = phy_write_paged(phydev, MII_MARVELL_VCT5_PAGE, MII_VCT5_SAMPLE_POINT_DISTANCE, @@ -1814,21 +1825,20 @@ static int marvell_vct5_amplitude_distance(struct phy_device *phydev, if (err) return err; - mV_pair0 = marvell_vct5_amplitude(phydev, 0); - mV_pair1 = marvell_vct5_amplitude(phydev, 1); - mV_pair2 = marvell_vct5_amplitude(phydev, 2); - mV_pair3 = marvell_vct5_amplitude(phydev, 3); + for (i = 0; i < 4; i++) { + if (pair != PHY_PAIR_ALL && i != pair) + continue; - ethnl_cable_test_amplitude(phydev, ETHTOOL_A_CABLE_PAIR_A, mV_pair0); - ethnl_cable_test_amplitude(phydev, ETHTOOL_A_CABLE_PAIR_B, mV_pair1); - ethnl_cable_test_amplitude(phydev, ETHTOOL_A_CABLE_PAIR_C, mV_pair2); - ethnl_cable_test_amplitude(phydev, ETHTOOL_A_CABLE_PAIR_D, mV_pair3); + mV = marvell_vct5_amplitude(phydev, i); + ethnl_cable_test_amplitude(phydev, i, mV); + } return 0; } static int marvell_vct5_amplitude_graph(struct phy_device *phydev) { + struct marvell_priv *priv = phydev->priv; int distance; int err; u16 reg; @@ -1843,8 +1853,11 @@ static int marvell_vct5_amplitude_graph(struct phy_device *phydev) if (err) return err; - for (distance = 0; distance <= 100; distance++) { - err = marvell_vct5_amplitude_distance(phydev, distance); + for (distance = priv->first; + distance <= priv->last; + distance += priv->step) { + err = marvell_vct5_amplitude_distance(phydev, distance, + priv->pair); if (err) return err; } @@ -1918,11 +1931,24 @@ static int marvell_vct7_cable_test_start(struct phy_device *phydev) MII_VCT7_CTRL_CENTIMETERS); } -static int marvell_vct5_cable_test_tdr_start(struct phy_device *phydev) +static int marvell_vct5_cable_test_tdr_start(struct phy_device *phydev, + const struct phy_tdr_config *cfg) { struct marvell_priv *priv = phydev->priv; int ret; + priv->cable_test_tdr = true; + priv->first = marvell_vct5_cm2distance(cfg->first); + priv->last = marvell_vct5_cm2distance(cfg->last); + priv->step = marvell_vct5_cm2distance(cfg->step); + priv->pair = cfg->pair; + + if (priv->first > MII_VCT5_SAMPLE_POINT_DISTANCE_MAX) + return -EINVAL; + + if (priv->last > MII_VCT5_SAMPLE_POINT_DISTANCE_MAX) + return -EINVAL; + /* Disable VCT7 */ ret = phy_write_paged(phydev, MII_MARVELL_VCT7_PAGE, MII_VCT7_CTRL, 0); @@ -1933,15 +1959,14 @@ static int marvell_vct5_cable_test_tdr_start(struct phy_device *phydev) if (ret) return ret; - priv->cable_test_tdr = true; ret = ethnl_cable_test_pulse(phydev, 1000); if (ret) return ret; return 
ethnl_cable_test_step(phydev, - marvell_vct5_distance2cm(0), - marvell_vct5_distance2cm(100), - marvell_vct5_distance2cm(1)); + marvell_vct5_distance2cm(priv->first), + marvell_vct5_distance2cm(priv->last), + marvell_vct5_distance2cm(priv->step)); } static int marvell_vct7_distance_to_length(int distance, bool meter) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 495d9ba3d5bf..1de3938628f4 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -553,7 +553,8 @@ out: EXPORT_SYMBOL(phy_start_cable_test); int phy_start_cable_test_tdr(struct phy_device *phydev, - struct netlink_ext_ack *extack) + struct netlink_ext_ack *extack, + const struct phy_tdr_config *config) { struct net_device *dev = phydev->attached_dev; int err = -ENOMEM; @@ -590,7 +591,7 @@ int phy_start_cable_test_tdr(struct phy_device *phydev, phy_link_down(phydev); netif_testing_on(dev); - err = phydev->drv->cable_test_tdr_start(phydev); + err = phydev->drv->cable_test_tdr_start(phydev, config); if (err) { netif_testing_off(dev); phy_link_up(phydev); diff --git a/include/linux/phy.h b/include/linux/phy.h index d3c384f353ca..8c05d0fb5c00 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -548,6 +548,18 @@ struct phy_device { #define to_phy_device(d) container_of(to_mdio_device(d), \ struct phy_device, mdio) +/* A structure containing possible configuration parameters + * for a TDR cable test. The driver does not need to implement + * all the parameters, but should report what is actually used. + */ +struct phy_tdr_config { + u32 first; + u32 last; + u32 step; + s8 pair; +}; +#define PHY_PAIR_ALL -1 + /* struct phy_driver: Driver structure for a particular PHY type * * driver_data: static driver data @@ -701,7 +713,8 @@ struct phy_driver { int (*cable_test_start)(struct phy_device *dev); /* Start a raw TDR cable test */ - int (*cable_test_tdr_start)(struct phy_device *dev); + int (*cable_test_tdr_start)(struct phy_device *dev, + const struct phy_tdr_config *config); /* Once per second, or on interrupt, request the status of the * test. 
@@ -1256,7 +1269,8 @@ int phy_reset_after_clk_enable(struct phy_device *phydev); int phy_start_cable_test(struct phy_device *phydev, struct netlink_ext_ack *extack); int phy_start_cable_test_tdr(struct phy_device *phydev, - struct netlink_ext_ack *extack); + struct netlink_ext_ack *extack, + const struct phy_tdr_config *config); #else static inline int phy_start_cable_test(struct phy_device *phydev, @@ -1267,7 +1281,8 @@ int phy_start_cable_test(struct phy_device *phydev, } static inline int phy_start_cable_test_tdr(struct phy_device *phydev, - struct netlink_ext_ack *extack) + struct netlink_ext_ack *extack, + const struct phy_tdr_config *config) { NL_SET_ERR_MSG(extack, "Kernel not compiled with PHYLIB support"); return -EOPNOTSUPP; diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 739faa7070c6..fc9051f2eeac 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -482,9 +482,22 @@ enum { /* CABLE TEST TDR */ +enum { + ETHTOOL_A_CABLE_TEST_TDR_CFG_UNSPEC, + ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST, /* u32 */ + ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST, /* u32 */ + ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP, /* u32 */ + ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR, /* u8 */ + + /* add new constants above here */ + __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT, + ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX = __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT - 1 +}; + enum { ETHTOOL_A_CABLE_TEST_TDR_UNSPEC, ETHTOOL_A_CABLE_TEST_TDR_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_CABLE_TEST_TDR_CFG, /* nest - *_TDR_CFG_* */ /* add new constants above here */ __ETHTOOL_A_CABLE_TEST_TDR_CNT, diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index 390d0673ff01..9991688d7d1d 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -5,7 +5,11 @@ #include "netlink.h" #include "common.h" -/* CABLE_TEST_ACT */ +/* 802.3 standard allows 100 meters for BaseT cables. However longer + * cables might work, depending on the quality of the cables and the + * PHY. So allow testing for up to 150 meters. 
+ */ +#define MAX_CABLE_LENGTH_CM (150 * 100) static const struct nla_policy cable_test_act_policy[ETHTOOL_A_CABLE_TEST_MAX + 1] = { @@ -203,16 +207,107 @@ err: } EXPORT_SYMBOL_GPL(ethnl_cable_test_fault_length); +struct cable_test_tdr_req_info { + struct ethnl_req_info base; +}; + +static const struct nla_policy +cable_test_tdr_act_cfg_policy[ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX + 1] = { + [ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST] = { .type = NLA_U32 }, + [ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST] = { .type = NLA_U32 }, + [ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP] = { .type = NLA_U32 }, + [ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR] = { .type = NLA_U8 }, +}; + static const struct nla_policy cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_MAX + 1] = { [ETHTOOL_A_CABLE_TEST_TDR_UNSPEC] = { .type = NLA_REJECT }, [ETHTOOL_A_CABLE_TEST_TDR_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_CABLE_TEST_TDR_CFG] = { .type = NLA_NESTED }, }; +/* CABLE_TEST_TDR_ACT */ +int ethnl_act_cable_test_tdr_cfg(const struct nlattr *nest, + struct genl_info *info, + struct phy_tdr_config *cfg) +{ + struct nlattr *tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX + 1]; + int ret; + + ret = nla_parse_nested(tb, ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX, nest, + cable_test_tdr_act_cfg_policy, info->extack); + if (ret < 0) + return ret; + + if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST]) + cfg->first = nla_get_u32( + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST]); + else + cfg->first = 100; + if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST]) + cfg->last = nla_get_u32(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST]); + else + cfg->last = MAX_CABLE_LENGTH_CM; + + if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP]) + cfg->step = nla_get_u32(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP]); + else + cfg->step = 100; + + if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR]) { + cfg->pair = nla_get_u8(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR]); + if (cfg->pair > ETHTOOL_A_CABLE_PAIR_D) { + NL_SET_ERR_MSG_ATTR( + info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR], + "invalid pair parameter"); + return -EINVAL; + } + } else { + cfg->pair = PHY_PAIR_ALL; + } + + if (cfg->first > MAX_CABLE_LENGTH_CM) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST], + "invalid first parameter"); + return -EINVAL; + } + + if (cfg->last > MAX_CABLE_LENGTH_CM) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST], + "invalid last parameter"); + return -EINVAL; + } + + if (cfg->first > cfg->last) { + NL_SET_ERR_MSG(info->extack, "invalid first/last parameter"); + return -EINVAL; + } + + if (!cfg->step) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP], + "invalid step parameter"); + return -EINVAL; + } + + if (cfg->step > (cfg->last - cfg->first)) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP], + "step parameter too big"); + return -EINVAL; + } + + return 0; +} + int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[ETHTOOL_A_CABLE_TEST_TDR_MAX + 1]; struct ethnl_req_info req_info = {}; + struct phy_tdr_config cfg; struct net_device *dev; int ret; @@ -235,12 +330,17 @@ int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info) goto out_dev_put; } + ret = ethnl_act_cable_test_tdr_cfg(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG], + info, &cfg); + if (ret) + goto out_dev_put; + rtnl_lock(); ret = ethnl_ops_begin(dev); if (ret < 0) goto out_rtnl; - ret = phy_start_cable_test_tdr(dev->phydev, info->extack); + ret = phy_start_cable_test_tdr(dev->phydev, info->extack, &cfg); ethnl_ops_complete(dev); -- cgit 
v1.2.3

From 60c2ef0ef07f319504763eaaed8cb003af879008 Mon Sep 17 00:00:00 2001
From: Sergey Matyukevich
Date: Fri, 24 Apr 2020 14:29:02 +0300
Subject: mac80211: fix variable names in TID config methods

Fix all variable names from 'tid' to 'tids' to avoid confusion: the
value is not a TID number, but a TID mask.

Signed-off-by: Sergey Matyukevich
Link: https://lore.kernel.org/r/20200424112905.26770-3-sergey.matyukevich.os@quantenna.com
Signed-off-by: Johannes Berg
---
 net/mac80211/cfg.c | 6 +++---
 net/mac80211/driver-ops.h | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 548a384b0509..06a2b7640a9d 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -3957,7 +3957,7 @@ static int ieee80211_set_tid_config(struct wiphy *wiphy,

 static int ieee80211_reset_tid_config(struct wiphy *wiphy,
 				      struct net_device *dev,
-				      const u8 *peer, u8 tid)
+				      const u8 *peer, u8 tids)
 {
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct sta_info *sta;
@@ -3967,7 +3967,7 @@ static int ieee80211_reset_tid_config(struct wiphy *wiphy,
 		return -EOPNOTSUPP;

 	if (!peer)
-		return drv_reset_tid_config(sdata->local, sdata, NULL, tid);
+		return drv_reset_tid_config(sdata->local, sdata, NULL, tids);

 	mutex_lock(&sdata->local->sta_mtx);
 	sta = sta_info_get_bss(sdata, peer);
@@ -3976,7 +3976,7 @@ static int ieee80211_reset_tid_config(struct wiphy *wiphy,
 		return -ENOENT;
 	}

-	ret = drv_reset_tid_config(sdata->local, sdata, &sta->sta, tid);
+	ret = drv_reset_tid_config(sdata->local, sdata, &sta->sta, tids);
 	mutex_unlock(&sdata->local->sta_mtx);

 	return ret;

diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 3877710e3b48..de69fc9c4f07 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -1375,12 +1375,12 @@ static inline int drv_set_tid_config(struct ieee80211_local *local,

 static inline int drv_reset_tid_config(struct ieee80211_local *local,
 				       struct ieee80211_sub_if_data *sdata,
-				       struct ieee80211_sta *sta, u8 tid)
+				       struct ieee80211_sta *sta, u8 tids)
 {
 	int ret;

 	might_sleep();
-	ret = local->ops->reset_tid_config(&local->hw, &sdata->vif, sta, tid);
+	ret = local->ops->reset_tid_config(&local->hw, &sdata->vif, sta, tids);
 	trace_drv_return_int(local, ret);

 	return ret;
--
cgit v1.2.3

From 33462e68231bccfe563a87614f4c4dd5d333837c Mon Sep 17 00:00:00 2001
From: Sergey Matyukevich
Date: Fri, 24 Apr 2020 14:29:03 +0300
Subject: cfg80211: add support for TID specific AMSDU configuration

This patch adds support to control per-TID MSDU aggregation using the
NL80211_TID_CONFIG_ATTR_AMSDU_CTRL attribute.
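[Editor's note: a rough libnl-3 userspace sketch, not part of the
commit, of turning A-MSDU aggregation off for TIDs 0 and 3 with the new
attribute. It assumes the nl80211 family id and the interface index were
resolved elsewhere, and omits error handling.]

	#include <netlink/netlink.h>
	#include <netlink/genl/genl.h>
	#include <linux/nl80211.h>

	static struct nl_msg *build_amsdu_disable(int nl80211_family, int ifindex)
	{
		struct nl_msg *msg = nlmsg_alloc();
		struct nlattr *tid_config, *entry;

		genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, nl80211_family,
			    0, 0, NL80211_CMD_SET_TID_CONFIG, 0);
		nla_put_u32(msg, NL80211_ATTR_IFINDEX, ifindex);

		/* NL80211_ATTR_TID_CONFIG carries an array of config nests */
		tid_config = nla_nest_start(msg, NL80211_ATTR_TID_CONFIG);
		entry = nla_nest_start(msg, 1);
		/* TIDS is a u16 bitmask: bits 0 and 3 select TIDs 0 and 3 */
		nla_put_u16(msg, NL80211_TID_CONFIG_ATTR_TIDS,
			    (1 << 0) | (1 << 3));
		nla_put_u8(msg, NL80211_TID_CONFIG_ATTR_AMSDU_CTRL,
			   NL80211_TID_CONFIG_DISABLE);
		nla_nest_end(msg, entry);
		nla_nest_end(msg, tid_config);

		return msg;
	}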
Signed-off-by: Sergey Matyukevich Link: https://lore.kernel.org/r/20200424112905.26770-4-sergey.matyukevich.os@quantenna.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 +++- include/uapi/linux/nl80211.h | 10 +++++++--- net/wireless/nl80211.c | 8 ++++++++ 3 files changed, 18 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index e71d4f690ef1..5cacf24cc9f0 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -640,8 +640,9 @@ struct cfg80211_chan_def { * @noack: noack configuration value for the TID * @retry_long: retry count value * @retry_short: retry count value - * @ampdu: Enable/Disable aggregation + * @ampdu: Enable/Disable MPDU aggregation * @rtscts: Enable/Disable RTS/CTS + * @amsdu: Enable/Disable MSDU aggregation */ struct cfg80211_tid_cfg { bool config_override; @@ -651,6 +652,7 @@ struct cfg80211_tid_cfg { u8 retry_long, retry_short; enum nl80211_tid_config ampdu; enum nl80211_tid_config rtscts; + enum nl80211_tid_config amsdu; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 9679d561f7d0..1ccb0bf657ec 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4844,12 +4844,15 @@ enum nl80211_tid_config { * &NL80211_CMD_SET_TID_CONFIG. Its type is u8, min value is 1 and * the max value is advertised by the driver in this attribute on * output in wiphy capabilities. - * @NL80211_TID_CONFIG_ATTR_AMPDU_CTRL: Enable/Disable aggregation for the TIDs - * specified in %NL80211_TID_CONFIG_ATTR_TIDS. Its type is u8, using - * the values from &nl80211_tid_config. + * @NL80211_TID_CONFIG_ATTR_AMPDU_CTRL: Enable/Disable MPDU aggregation + * for the TIDs specified in %NL80211_TID_CONFIG_ATTR_TIDS. + * Its type is u8, using the values from &nl80211_tid_config. * @NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL: Enable/Disable RTS_CTS for the TIDs * specified in %NL80211_TID_CONFIG_ATTR_TIDS. It is u8 type, using * the values from &nl80211_tid_config. + * @NL80211_TID_CONFIG_ATTR_AMSDU_CTRL: Enable/Disable MSDU aggregation + * for the TIDs specified in %NL80211_TID_CONFIG_ATTR_TIDS. + * Its type is u8, using the values from &nl80211_tid_config. 
*/ enum nl80211_tid_config_attr { __NL80211_TID_CONFIG_ATTR_INVALID, @@ -4863,6 +4866,7 @@ enum nl80211_tid_config_attr { NL80211_TID_CONFIG_ATTR_RETRY_LONG, NL80211_TID_CONFIG_ATTR_AMPDU_CTRL, NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL, + NL80211_TID_CONFIG_ATTR_AMSDU_CTRL, /* keep last */ __NL80211_TID_CONFIG_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index fa66d5b6f557..482a80b78844 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -343,6 +343,8 @@ nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = { NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), [NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL] = NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), + [NL80211_TID_CONFIG_ATTR_AMSDU_CTRL] = + NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), }; static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { @@ -14080,6 +14082,12 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev, nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL]); } + if (attrs[NL80211_TID_CONFIG_ATTR_AMSDU_CTRL]) { + tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_AMSDU_CTRL); + tid_conf->amsdu = + nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_AMSDU_CTRL]); + } + if (peer) mask = rdev->wiphy.tid_config_support.peer; else -- cgit v1.2.3 From c03369558c435f7e82f7c06b0173fa73c1ed15c0 Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Fri, 24 Apr 2020 14:29:04 +0300 Subject: nl80211: simplify peer specific TID configuration Current rule for applying TID configuration for specific peer looks overly complicated. No need to reject new TID configuration when override flag is specified. Another call with the same TID configuration, but without override flag, allows to apply new configuration anyway. Use the same approach as for the 'all peers' case: if override flag is specified, then reset existing TID configuration and immediately apply a new one. Signed-off-by: Sergey Matyukevich Link: https://lore.kernel.org/r/20200424112905.26770-5-sergey.matyukevich.os@quantenna.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 10 ++++------ net/wireless/nl80211.c | 5 +---- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 1ccb0bf657ec..d1b1d9e49887 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4823,12 +4823,10 @@ enum nl80211_tid_config { * (%NL80211_TID_CONFIG_ATTR_TIDS, %NL80211_TID_CONFIG_ATTR_OVERRIDE). * @NL80211_TID_CONFIG_ATTR_PEER_SUPP: same as the previous per-vif one, but * per peer instead. - * @NL80211_TID_CONFIG_ATTR_OVERRIDE: flag attribue, if no peer - * is selected, if set indicates that the new configuration overrides - * all previous peer configurations, otherwise previous peer specific - * configurations should be left untouched. If peer is selected then - * it will reset particular TID configuration of that peer and it will - * not accept other TID config attributes along with peer. + * @NL80211_TID_CONFIG_ATTR_OVERRIDE: flag attribue, if set indicates + * that the new configuration overrides all previous peer + * configurations, otherwise previous peer specific configurations + * should be left untouched. * @NL80211_TID_CONFIG_ATTR_TIDS: a bitmask value of TIDs (bit 0 to 7) * Its type is u16. * @NL80211_TID_CONFIG_ATTR_NOACK: Configure ack policy for the TID. 
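[Editor's note: with the simplified semantics, a config entry that
should wipe earlier state simply carries the flag; the kernel then
resets the existing TID configuration before applying the new one. An
illustrative fragment, not part of the commit, with message setup as in
the earlier libnl sketch.]

	/* Reset any previous TID configuration, then apply this one */
	entry = nla_nest_start(msg, 1);
	nla_put_flag(msg, NL80211_TID_CONFIG_ATTR_OVERRIDE);
	nla_put_u16(msg, NL80211_TID_CONFIG_ATTR_TIDS, 1 << 0);
	nla_put_u8(msg, NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL,
		   NL80211_TID_CONFIG_ENABLE);
	nla_nest_end(msg, entry);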
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 482a80b78844..258c621f651c 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -14036,10 +14036,7 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev,

 	if (rdev->ops->reset_tid_config) {
 		err = rdev_reset_tid_config(rdev, dev, peer,
 					    tid_conf->tids);
-		/* If peer is there no other configuration will be
-		 * allowed
-		 */
-		if (err || peer)
+		if (err)
 			return err;
 	} else {
 		return -EINVAL;
--
cgit v1.2.3

From e76fede8bf7c90d92c799d9ceb092dec48346e2c Mon Sep 17 00:00:00 2001
From: Thomas Pedersen
Date: Thu, 30 Apr 2020 10:25:50 -0700
Subject: cfg80211: add KHz variants of frame RX API

Drivers may wish to report the RX frequency in units of KHz. Provide
cfg80211_rx_mgmt_khz() and wrap it with cfg80211_rx_mgmt() so existing
drivers which can't report KHz anyway don't need to change. Add a
similar wrapper for cfg80211_report_obss_beacon() so the frequency
units stay somewhat consistent.

This doesn't actually change the nl80211 API yet.

Signed-off-by: Thomas Pedersen
Link: https://lore.kernel.org/r/20200430172554.18383-2-thomas@adapt-ip.com
[fix mac80211 calling the non-khz version of obss beacon report,
 drop trace point name changes]
Signed-off-by: Johannes Berg
---
 include/linux/ieee80211.h | 2 ++
 include/net/cfg80211.h | 54 ++++++++++++++++++++++++++++++++++++++++++-----
 net/mac80211/rx.c | 12 ++++++-----
 net/wireless/mlme.c | 6 +++---
 net/wireless/nl80211.c | 12 +++++------
 net/wireless/trace.h | 8 +++----
 6 files changed, 71 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index a561db435a4b..41d5f000c0d9 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -3333,6 +3333,8 @@ static inline int ieee80211_get_tdls_action(struct sk_buff *skb, u32 hdr_size)
 /* convert frequencies */
 #define MHZ_TO_KHZ(freq) ((freq) * 1000)
 #define KHZ_TO_MHZ(freq) ((freq) / 1000)
+#define PR_KHZ(f) KHZ_TO_MHZ(f), f % 1000
+#define KHZ_F "%d.%03d"

 /* convert powers */
 #define DBI_TO_MBI(gain) ((gain) * 100)

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 5cacf24cc9f0..7415f77d99ca 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -6988,6 +6988,26 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr,
 			  enum nl80211_connect_failed_reason reason,
 			  gfp_t gfp);

+/**
+ * cfg80211_rx_mgmt_khz - notification of received, unprocessed management frame
+ * @wdev: wireless device receiving the frame
+ * @freq: Frequency on which the frame was received in KHz
+ * @sig_dbm: signal strength in dBm, or 0 if unknown
+ * @buf: Management frame (header + body)
+ * @len: length of the frame data
+ * @flags: flags, as defined in enum nl80211_rxmgmt_flags
+ *
+ * This function is called whenever an Action frame is received for a station
+ * mode interface, but is not processed in kernel.
+ *
+ * Return: %true if a user space application has registered for this frame.
+ * For action frames, that makes it responsible for rejecting unrecognized
+ * action frames; %false otherwise, in which case for action frames the
+ * driver is responsible for rejecting the frame.
+ */ +bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm, + const u8 *buf, size_t len, u32 flags); + /** * cfg80211_rx_mgmt - notification of received, unprocessed management frame * @wdev: wireless device receiving the frame @@ -7005,8 +7025,13 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr, * action frames; %false otherwise, in which case for action frames the * driver is responsible for rejecting the frame. */ -bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_dbm, - const u8 *buf, size_t len, u32 flags); +static inline bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, + int sig_dbm, const u8 *buf, size_t len, + u32 flags) +{ + return cfg80211_rx_mgmt_khz(wdev, MHZ_TO_KHZ(freq), sig_dbm, buf, len, + flags); +} /** * cfg80211_mgmt_tx_status - notification of TX status for management frame @@ -7204,6 +7229,21 @@ void cfg80211_probe_status(struct net_device *dev, const u8 *addr, u64 cookie, bool acked, s32 ack_signal, bool is_valid_ack_signal, gfp_t gfp); +/** + * cfg80211_report_obss_beacon_khz - report beacon from other APs + * @wiphy: The wiphy that received the beacon + * @frame: the frame + * @len: length of the frame + * @freq: frequency the frame was received on in KHz + * @sig_dbm: signal strength in dBm, or 0 if unknown + * + * Use this function to report to userspace when a beacon was + * received. It is not useful to call this when there is no + * netdev that is in AP/GO mode. + */ +void cfg80211_report_obss_beacon_khz(struct wiphy *wiphy, const u8 *frame, + size_t len, int freq, int sig_dbm); + /** * cfg80211_report_obss_beacon - report beacon from other APs * @wiphy: The wiphy that received the beacon @@ -7216,9 +7256,13 @@ void cfg80211_probe_status(struct net_device *dev, const u8 *addr, * received. It is not useful to call this when there is no * netdev that is in AP/GO mode. 
*/ -void cfg80211_report_obss_beacon(struct wiphy *wiphy, - const u8 *frame, size_t len, - int freq, int sig_dbm); +static inline void cfg80211_report_obss_beacon(struct wiphy *wiphy, + const u8 *frame, size_t len, + int freq, int sig_dbm) +{ + cfg80211_report_obss_beacon_khz(wiphy, frame, len, MHZ_TO_KHZ(freq), + sig_dbm); +} /** * cfg80211_reg_can_beacon - check if beaconing is allowed diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index eaf8931e4627..8e47b0d31051 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3095,9 +3095,10 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) !(status->flag & RX_FLAG_NO_SIGNAL_VAL)) sig = status->signal; - cfg80211_report_obss_beacon(rx->local->hw.wiphy, - rx->skb->data, rx->skb->len, - status->freq, sig); + cfg80211_report_obss_beacon_khz(rx->local->hw.wiphy, + rx->skb->data, rx->skb->len, + ieee80211_rx_status_to_khz(status), + sig); rx->flags |= IEEE80211_RX_BEACON_REPORTED; } @@ -3443,8 +3444,9 @@ ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx) !(status->flag & RX_FLAG_NO_SIGNAL_VAL)) sig = status->signal; - if (cfg80211_rx_mgmt(&rx->sdata->wdev, status->freq, sig, - rx->skb->data, rx->skb->len, 0)) { + if (cfg80211_rx_mgmt_khz(&rx->sdata->wdev, + ieee80211_rx_status_to_khz(status), sig, + rx->skb->data, rx->skb->len, 0)) { if (rx->sta) rx->sta->rx_stats.packets++; dev_kfree_skb(rx->skb); diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 409497a3527d..189334314cba 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -729,8 +729,8 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, return rdev_mgmt_tx(rdev, wdev, params, cookie); } -bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_dbm, - const u8 *buf, size_t len, u32 flags) +bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm, + const u8 *buf, size_t len, u32 flags) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); @@ -785,7 +785,7 @@ bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_dbm, trace_cfg80211_return_bool(result); return result; } -EXPORT_SYMBOL(cfg80211_rx_mgmt); +EXPORT_SYMBOL(cfg80211_rx_mgmt_khz); void cfg80211_sched_dfs_chan_update(struct cfg80211_registered_device *rdev) { diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 258c621f651c..f6523f1485a3 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -16214,7 +16214,7 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, netdev->ifindex)) || nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), NL80211_ATTR_PAD) || - nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, freq) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, KHZ_TO_MHZ(freq)) || (sig_dbm && nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) || nla_put(msg, NL80211_ATTR_FRAME, len, buf) || @@ -16840,9 +16840,8 @@ void cfg80211_probe_status(struct net_device *dev, const u8 *addr, } EXPORT_SYMBOL(cfg80211_probe_status); -void cfg80211_report_obss_beacon(struct wiphy *wiphy, - const u8 *frame, size_t len, - int freq, int sig_dbm) +void cfg80211_report_obss_beacon_khz(struct wiphy *wiphy, const u8 *frame, + size_t len, int freq, int sig_dbm) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct sk_buff *msg; @@ -16865,7 +16864,8 @@ void cfg80211_report_obss_beacon(struct wiphy *wiphy, if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || (freq && - nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, freq)) || + nla_put_u32(msg, 
NL80211_ATTR_WIPHY_FREQ, + KHZ_TO_MHZ(freq))) || (sig_dbm && nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) || nla_put(msg, NL80211_ATTR_FRAME, len, frame)) @@ -16882,7 +16882,7 @@ void cfg80211_report_obss_beacon(struct wiphy *wiphy, spin_unlock_bh(&rdev->beacon_registrations_lock); nlmsg_free(msg); } -EXPORT_SYMBOL(cfg80211_report_obss_beacon); +EXPORT_SYMBOL(cfg80211_report_obss_beacon_khz); #ifdef CONFIG_PM static int cfg80211_net_detect_results(struct sk_buff *msg, diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 53c887ea67c7..f2ab44a2a3e4 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2840,8 +2840,8 @@ TRACE_EVENT(cfg80211_rx_mgmt, __entry->freq = freq; __entry->sig_dbm = sig_dbm; ), - TP_printk(WDEV_PR_FMT ", freq: %d, sig dbm: %d", - WDEV_PR_ARG, __entry->freq, __entry->sig_dbm) + TP_printk(WDEV_PR_FMT ", freq: "KHZ_F", sig dbm: %d", + WDEV_PR_ARG, PR_KHZ(__entry->freq), __entry->sig_dbm) ); TRACE_EVENT(cfg80211_mgmt_tx_status, @@ -3121,8 +3121,8 @@ TRACE_EVENT(cfg80211_report_obss_beacon, __entry->freq = freq; __entry->sig_dbm = sig_dbm; ), - TP_printk(WIPHY_PR_FMT ", freq: %d, sig_dbm: %d", - WIPHY_PR_ARG, __entry->freq, __entry->sig_dbm) + TP_printk(WIPHY_PR_FMT ", freq: "KHZ_F", sig_dbm: %d", + WIPHY_PR_ARG, PR_KHZ(__entry->freq), __entry->sig_dbm) ); TRACE_EVENT(cfg80211_tdls_oper_request, -- cgit v1.2.3 From 942ba88ba9c87f5e225574f1f0d6548f0105ed73 Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Thu, 30 Apr 2020 10:25:51 -0700 Subject: nl80211: add KHz frequency offset for most wifi commands cfg80211 recently gained the ability to understand a frequency offset component in KHz. Expose this in nl80211 through the new attributes NL80211_ATTR_WIPHY_FREQ_OFFSET, NL80211_FREQUENCY_ATTR_OFFSET, NL80211_ATTR_CENTER_FREQ1_OFFSET, and NL80211_BSS_FREQUENCY_OFFSET. These add support to send and receive a KHz offset component with the following NL80211 commands: - NL80211_CMD_FRAME - NL80211_CMD_GET_SCAN - NL80211_CMD_AUTHENTICATE - NL80211_CMD_ASSOCIATE - NL80211_CMD_CONNECT Along with any other command which takes a chandef, ie: - NL80211_CMD_SET_CHANNEL - NL80211_CMD_SET_WIPHY - NL80211_CMD_START_AP - NL80211_CMD_RADAR_DETECT - NL80211_CMD_NOTIFY_RADAR - NL80211_CMD_CHANNEL_SWITCH - NL80211_JOIN_IBSS - NL80211_CMD_REMAIN_ON_CHANNEL - NL80211_CMD_JOIN_OCB - NL80211_CMD_JOIN_MESH - NL80211_CMD_TDLS_CHANNEL_SWITCH If the driver advertises a band containing channels with frequency offset, it must also verify support for frequency offset channels in its cfg80211 ops, or return an error. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200430172554.18383-3-thomas@adapt-ip.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 50 ++++++++++++++++++---------- net/wireless/nl80211.c | 78 ++++++++++++++++++++++++++++++++------------ 2 files changed, 91 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d1b1d9e49887..b1cd132c1d27 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -296,13 +296,14 @@ * to get a list of all present wiphys. 
* @NL80211_CMD_SET_WIPHY: set wiphy parameters, needs %NL80211_ATTR_WIPHY or * %NL80211_ATTR_IFINDEX; can be used to set %NL80211_ATTR_WIPHY_NAME, - * %NL80211_ATTR_WIPHY_TXQ_PARAMS, %NL80211_ATTR_WIPHY_FREQ (and the - * attributes determining the channel width; this is used for setting - * monitor mode channel), %NL80211_ATTR_WIPHY_RETRY_SHORT, - * %NL80211_ATTR_WIPHY_RETRY_LONG, %NL80211_ATTR_WIPHY_FRAG_THRESHOLD, - * and/or %NL80211_ATTR_WIPHY_RTS_THRESHOLD. - * However, for setting the channel, see %NL80211_CMD_SET_CHANNEL - * instead, the support here is for backward compatibility only. + * %NL80211_ATTR_WIPHY_TXQ_PARAMS, %NL80211_ATTR_WIPHY_FREQ, + * %NL80211_ATTR_WIPHY_FREQ_OFFSET (and the attributes determining the + * channel width; this is used for setting monitor mode channel), + * %NL80211_ATTR_WIPHY_RETRY_SHORT, %NL80211_ATTR_WIPHY_RETRY_LONG, + * %NL80211_ATTR_WIPHY_FRAG_THRESHOLD, and/or + * %NL80211_ATTR_WIPHY_RTS_THRESHOLD. However, for setting the channel, + * see %NL80211_CMD_SET_CHANNEL instead, the support here is for backward + * compatibility only. * @NL80211_CMD_NEW_WIPHY: Newly created wiphy, response to get request * or rename notification. Has attributes %NL80211_ATTR_WIPHY and * %NL80211_ATTR_WIPHY_NAME. @@ -351,7 +352,8 @@ * %NL80211_ATTR_AUTH_TYPE, %NL80211_ATTR_INACTIVITY_TIMEOUT, * %NL80211_ATTR_ACL_POLICY and %NL80211_ATTR_MAC_ADDRS. * The channel to use can be set on the interface or be given using the - * %NL80211_ATTR_WIPHY_FREQ and the attributes determining channel width. + * %NL80211_ATTR_WIPHY_FREQ and %NL80211_ATTR_WIPHY_FREQ_OFFSET, and the + * attributes determining channel width. * @NL80211_CMD_NEW_BEACON: old alias for %NL80211_CMD_START_AP * @NL80211_CMD_STOP_AP: Stop AP operation on the given interface * @NL80211_CMD_DEL_BEACON: old alias for %NL80211_CMD_STOP_AP @@ -536,11 +538,12 @@ * interface. %NL80211_ATTR_MAC is used to specify PeerSTAAddress (and * BSSID in case of station mode). %NL80211_ATTR_SSID is used to specify * the SSID (mainly for association, but is included in authentication - * request, too, to help BSS selection. %NL80211_ATTR_WIPHY_FREQ is used - * to specify the frequence of the channel in MHz. %NL80211_ATTR_AUTH_TYPE - * is used to specify the authentication type. %NL80211_ATTR_IE is used to - * define IEs (VendorSpecificInfo, but also including RSN IE and FT IEs) - * to be added to the frame. + * request, too, to help BSS selection. %NL80211_ATTR_WIPHY_FREQ + + * %NL80211_ATTR_WIPHY_FREQ_OFFSET is used to specify the frequence of the + * channel in MHz. %NL80211_ATTR_AUTH_TYPE is used to specify the + * authentication type. %NL80211_ATTR_IE is used to define IEs + * (VendorSpecificInfo, but also including RSN IE and FT IEs) to be added + * to the frame. * When used as an event, this reports reception of an Authentication * frame in station and IBSS modes when the local MLME processed the * frame, i.e., it was for the local STA and was received in correct @@ -595,8 +598,9 @@ * requests to connect to a specified network but without separating * auth and assoc steps. 
For this, you need to specify the SSID in a * %NL80211_ATTR_SSID attribute, and can optionally specify the association - * IEs in %NL80211_ATTR_IE, %NL80211_ATTR_AUTH_TYPE, %NL80211_ATTR_USE_MFP, - * %NL80211_ATTR_MAC, %NL80211_ATTR_WIPHY_FREQ, %NL80211_ATTR_CONTROL_PORT, + * IEs in %NL80211_ATTR_IE, %NL80211_ATTR_AUTH_TYPE, + * %NL80211_ATTR_USE_MFP, %NL80211_ATTR_MAC, %NL80211_ATTR_WIPHY_FREQ, + * %NL80211_ATTR_WIPHY_FREQ_OFFSET, %NL80211_ATTR_CONTROL_PORT, * %NL80211_ATTR_CONTROL_PORT_ETHERTYPE, * %NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT, * %NL80211_ATTR_CONTROL_PORT_OVER_NL80211, %NL80211_ATTR_MAC_HINT, and @@ -1433,7 +1437,8 @@ enum nl80211_commands { * of &enum nl80211_chan_width, describing the channel width. See the * documentation of the enum for more information. * @NL80211_ATTR_CENTER_FREQ1: Center frequency of the first part of the - * channel, used for anything but 20 MHz bandwidth + * channel, used for anything but 20 MHz bandwidth. In S1G this is the + * operating channel center frequency. * @NL80211_ATTR_CENTER_FREQ2: Center frequency of the second part of the * channel, used only for 80+80 MHz bandwidth * @NL80211_ATTR_WIPHY_CHANNEL_TYPE: included with NL80211_ATTR_WIPHY_FREQ @@ -2480,9 +2485,14 @@ enum nl80211_commands { * entry without having to force a disconnection after the PMK timeout. If * no roaming occurs between the reauth threshold and PMK expiration, * disassociation is still forced. - * * @NL80211_ATTR_RECEIVE_MULTICAST: multicast flag for the * %NL80211_CMD_REGISTER_FRAME command, see the description there. + * @NL80211_ATTR_WIPHY_FREQ_OFFSET: offset of the associated + * %NL80211_ATTR_WIPHY_FREQ in positive KHz. Only valid when supplied with + * an %NL80211_ATTR_WIPHY_FREQ_OFFSET. + * @NL80211_ATTR_CENTER_FREQ1_OFFSET: Center frequency offset in KHz for the + * first channel segment specified in %NL80211_ATTR_CENTER_FREQ1. + * * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined @@ -2960,6 +2970,8 @@ enum nl80211_attrs { NL80211_ATTR_PMK_REAUTH_THRESHOLD, NL80211_ATTR_RECEIVE_MULTICAST, + NL80211_ATTR_WIPHY_FREQ_OFFSET, + NL80211_ATTR_CENTER_FREQ1_OFFSET, /* add attributes here, update the policy in nl80211.c */ @@ -3682,6 +3694,7 @@ enum nl80211_wmm_rule { * (see &enum nl80211_wmm_rule) * @NL80211_FREQUENCY_ATTR_NO_HE: HE operation is not allowed on this channel * in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_OFFSET: frequency offset in KHz * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number * currently defined * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use @@ -3712,6 +3725,7 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_NO_10MHZ, NL80211_FREQUENCY_ATTR_WMM, NL80211_FREQUENCY_ATTR_NO_HE, + NL80211_FREQUENCY_ATTR_OFFSET, /* keep last */ __NL80211_FREQUENCY_ATTR_AFTER_LAST, @@ -4482,6 +4496,7 @@ enum nl80211_bss_scan_width { * @NL80211_BSS_CHAIN_SIGNAL: per-chain signal strength of last BSS update. * Contains a nested array of signal strength attributes (u8, dBm), * using the nesting index as the antenna number. 
+ * @NL80211_BSS_FREQUENCY_OFFSET: frequency offset in KHz * @__NL80211_BSS_AFTER_LAST: internal * @NL80211_BSS_MAX: highest BSS attribute */ @@ -4506,6 +4521,7 @@ enum nl80211_bss { NL80211_BSS_PARENT_TSF, NL80211_BSS_PARENT_BSSID, NL80211_BSS_CHAIN_SIGNAL, + NL80211_BSS_FREQUENCY_OFFSET, /* keep last */ __NL80211_BSS_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f6523f1485a3..87d7efd186d0 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -365,6 +365,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_CHANNEL_WIDTH] = { .type = NLA_U32 }, [NL80211_ATTR_CENTER_FREQ1] = { .type = NLA_U32 }, + [NL80211_ATTR_CENTER_FREQ1_OFFSET] = NLA_POLICY_RANGE(NLA_U32, 0, 999), [NL80211_ATTR_CENTER_FREQ2] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_RETRY_SHORT] = NLA_POLICY_MIN(NLA_U8, 1), @@ -638,6 +639,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_PMK_LIFETIME] = NLA_POLICY_MIN(NLA_U32, 1), [NL80211_ATTR_PMK_REAUTH_THRESHOLD] = NLA_POLICY_RANGE(NLA_U8, 1, 100), [NL80211_ATTR_RECEIVE_MULTICAST] = { .type = NLA_FLAG }, + [NL80211_ATTR_WIPHY_FREQ_OFFSET] = NLA_POLICY_RANGE(NLA_U32, 0, 999), }; /* policy for the key attributes */ @@ -904,6 +906,9 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, chan->center_freq)) goto nla_put_failure; + if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_OFFSET, chan->freq_offset)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_DISABLED) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_DISABLED)) goto nla_put_failure; @@ -1309,13 +1314,11 @@ static int nl80211_key_allowed(struct wireless_dev *wdev) } static struct ieee80211_channel *nl80211_get_valid_chan(struct wiphy *wiphy, - struct nlattr *tb) + u32 freq) { struct ieee80211_channel *chan; - if (tb == NULL) - return NULL; - chan = ieee80211_get_channel(wiphy, nla_get_u32(tb)); + chan = ieee80211_get_channel_khz(wiphy, freq); if (!chan || chan->flags & IEEE80211_CHAN_DISABLED) return NULL; return chan; @@ -2770,13 +2773,17 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, if (!attrs[NL80211_ATTR_WIPHY_FREQ]) return -EINVAL; - control_freq = nla_get_u32(attrs[NL80211_ATTR_WIPHY_FREQ]); + control_freq = MHZ_TO_KHZ( + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); + if (info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]) + control_freq += + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]); memset(chandef, 0, sizeof(*chandef)); - - chandef->chan = ieee80211_get_channel(&rdev->wiphy, control_freq); + chandef->chan = ieee80211_get_channel_khz(&rdev->wiphy, control_freq); chandef->width = NL80211_CHAN_WIDTH_20_NOHT; - chandef->center_freq1 = control_freq; + chandef->center_freq1 = KHZ_TO_MHZ(control_freq); + chandef->freq1_offset = control_freq % 1000; chandef->center_freq2 = 0; /* Primary channel not allowed */ @@ -2824,9 +2831,15 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, } else if (attrs[NL80211_ATTR_CHANNEL_WIDTH]) { chandef->width = nla_get_u32(attrs[NL80211_ATTR_CHANNEL_WIDTH]); - if (attrs[NL80211_ATTR_CENTER_FREQ1]) + if (attrs[NL80211_ATTR_CENTER_FREQ1]) { chandef->center_freq1 = nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ1]); + if (attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET]) + chandef->freq1_offset = nla_get_u32( + attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET]); + else + chandef->freq1_offset = 0; + } if (attrs[NL80211_ATTR_CENTER_FREQ2]) chandef->center_freq2 = nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ2]); @@ -3259,6 
+3272,9 @@ static int nl80211_send_chandef(struct sk_buff *msg, if (nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, chandef->chan->center_freq)) return -ENOBUFS; + if (nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ_OFFSET, + chandef->chan->freq_offset)) + return -ENOBUFS; switch (chandef->width) { case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: @@ -8873,6 +8889,8 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, goto nla_put_failure; if (nla_put_u16(msg, NL80211_BSS_CAPABILITY, res->capability) || nla_put_u32(msg, NL80211_BSS_FREQUENCY, res->channel->center_freq) || + nla_put_u32(msg, NL80211_BSS_FREQUENCY_OFFSET, + res->channel->freq_offset) || nla_put_u32(msg, NL80211_BSS_CHAN_WIDTH, res->scan_width) || nla_put_u32(msg, NL80211_BSS_SEEN_MS_AGO, jiffies_to_msecs(jiffies - intbss->ts))) @@ -9141,6 +9159,7 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) enum nl80211_auth_type auth_type; struct key_parse key; bool local_state_change; + u32 freq; if (!info->attrs[NL80211_ATTR_MAC]) return -EINVAL; @@ -9197,8 +9216,12 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) return -EOPNOTSUPP; bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); - chan = nl80211_get_valid_chan(&rdev->wiphy, - info->attrs[NL80211_ATTR_WIPHY_FREQ]); + freq = MHZ_TO_KHZ(nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); + if (info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]) + freq += + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]); + + chan = nl80211_get_valid_chan(&rdev->wiphy, freq); if (!chan) return -EINVAL; @@ -9388,6 +9411,7 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) struct cfg80211_assoc_request req = {}; const u8 *bssid, *ssid; int err, ssid_len = 0; + u32 freq; if (dev->ieee80211_ptr->conn_owner_nlportid && dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid) @@ -9407,8 +9431,11 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); - chan = nl80211_get_valid_chan(&rdev->wiphy, - info->attrs[NL80211_ATTR_WIPHY_FREQ]); + freq = MHZ_TO_KHZ(nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); + if (info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]) + freq += + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]); + chan = nl80211_get_valid_chan(&rdev->wiphy, freq); if (!chan) return -EINVAL; @@ -10088,6 +10115,7 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) struct cfg80211_connect_params connect; struct wiphy *wiphy; struct cfg80211_cached_keys *connkeys = NULL; + u32 freq = 0; int err; memset(&connect, 0, sizeof(connect)); @@ -10158,14 +10186,21 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) connect.prev_bssid = nla_data(info->attrs[NL80211_ATTR_PREV_BSSID]); - if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) { - connect.channel = nl80211_get_valid_chan( - wiphy, info->attrs[NL80211_ATTR_WIPHY_FREQ]); + if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) + freq = MHZ_TO_KHZ(nla_get_u32( + info->attrs[NL80211_ATTR_WIPHY_FREQ])); + if (info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]) + freq += + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]); + + if (freq) { + connect.channel = nl80211_get_valid_chan(wiphy, freq); if (!connect.channel) return -EINVAL; } else if (info->attrs[NL80211_ATTR_WIPHY_FREQ_HINT]) { - connect.channel_hint = nl80211_get_valid_chan( - wiphy, info->attrs[NL80211_ATTR_WIPHY_FREQ_HINT]); + freq = 
nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_HINT]); + freq = MHZ_TO_KHZ(freq); + connect.channel_hint = nl80211_get_valid_chan(wiphy, freq); if (!connect.channel_hint) return -EINVAL; } @@ -16215,6 +16250,7 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), NL80211_ATTR_PAD) || nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, KHZ_TO_MHZ(freq)) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ_OFFSET, freq % 1000) || (sig_dbm && nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) || nla_put(msg, NL80211_ATTR_FRAME, len, buf) || @@ -16864,8 +16900,10 @@ void cfg80211_report_obss_beacon_khz(struct wiphy *wiphy, const u8 *frame, if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || (freq && - nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, - KHZ_TO_MHZ(freq))) || + (nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, + KHZ_TO_MHZ(freq)) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ_OFFSET, + freq % 1000))) || (sig_dbm && nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) || nla_put(msg, NL80211_ATTR_FRAME, len, frame)) -- cgit v1.2.3 From 2032f3b2f943256ff40df23182913dfc7e73ec6a Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Thu, 30 Apr 2020 10:25:52 -0700 Subject: nl80211: support scan frequencies in KHz If the driver advertises NL80211_EXT_FEATURE_SCAN_FREQ_KHZ, userspace can omit NL80211_ATTR_SCAN_FREQUENCIES in favor of an NL80211_ATTR_SCAN_FREQ_KHZ attribute. To get scan results in KHz, userspace must also set the NL80211_SCAN_FLAG_FREQ_KHZ flag. This lets nl80211 remain compatible with older userspaces while not requiring and sending redundant (and potentially incorrect) scan frequency sets. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200430172554.18383-4-thomas@adapt-ip.com [use just nla_nest_start() (not _noflag) for NL80211_ATTR_SCAN_FREQ_KHZ] Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 13 ++++++++++- net/mac80211/main.c | 2 ++ net/wireless/nl80211.c | 51 +++++++++++++++++++++++++++++++++----------- 3 files changed, 53 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index b1cd132c1d27..47d39b6a073d 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2492,7 +2492,7 @@ enum nl80211_commands { * an %NL80211_ATTR_WIPHY_FREQ_OFFSET. * @NL80211_ATTR_CENTER_FREQ1_OFFSET: Center frequency offset in KHz for the * first channel segment specified in %NL80211_ATTR_CENTER_FREQ1. - * + * @NL80211_ATTR_SCAN_FREQ_KHZ: nested attribute with KHz frequencies * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined @@ -2972,6 +2972,7 @@ enum nl80211_attrs { NL80211_ATTR_RECEIVE_MULTICAST, NL80211_ATTR_WIPHY_FREQ_OFFSET, NL80211_ATTR_CENTER_FREQ1_OFFSET, + NL80211_ATTR_SCAN_FREQ_KHZ, /* add attributes here, update the policy in nl80211.c */ @@ -5723,6 +5724,11 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS: management frame registrations * are possible for multicast frames and those will be reported properly. * + * @NL80211_EXT_FEATURE_SCAN_FREQ_KHZ: This driver supports receiving and + * reporting scan requests with %NL80211_ATTR_SCAN_FREQ_KHZ. In order to + * report %NL80211_ATTR_SCAN_FREQ_KHZ, %NL80211_SCAN_FLAG_FREQ_KHZ must be + * included in the scan request. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
*/ @@ -5776,6 +5782,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_DEL_IBSS_STA, NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS, NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT, + NL80211_EXT_FEATURE_SCAN_FREQ_KHZ, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, @@ -5887,6 +5894,9 @@ enum nl80211_timeout_reason { * @NL80211_SCAN_FLAG_MIN_PREQ_CONTENT: minimize probe request content to * only have supported rates and no additional capabilities (unless * added by userspace explicitly.) + * @NL80211_SCAN_FLAG_FREQ_KHZ: report scan results with + * %NL80211_ATTR_SCAN_FREQ_KHZ. This also means + * %NL80211_ATTR_SCAN_FREQUENCIES will not be included. */ enum nl80211_scan_flags { NL80211_SCAN_FLAG_LOW_PRIORITY = 1<<0, @@ -5902,6 +5912,7 @@ enum nl80211_scan_flags { NL80211_SCAN_FLAG_HIGH_ACCURACY = 1<<10, NL80211_SCAN_FLAG_RANDOM_SN = 1<<11, NL80211_SCAN_FLAG_MIN_PREQ_CONTENT = 1<<12, + NL80211_SCAN_FLAG_FREQ_KHZ = 1<<13, }; /** diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 06c90d360633..ac74bd780b42 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -596,6 +596,8 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211); wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_CONTROL_PORT_NO_PREAUTH); + wiphy_ext_feature_set(wiphy, + NL80211_EXT_FEATURE_SCAN_FREQ_KHZ); if (!ops->hw_scan) { wiphy->features |= NL80211_FEATURE_LOW_PRIORITY_SCAN | diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 87d7efd186d0..84bfa147769a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -640,6 +640,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_PMK_REAUTH_THRESHOLD] = NLA_POLICY_RANGE(NLA_U8, 1, 100), [NL80211_ATTR_RECEIVE_MULTICAST] = { .type = NLA_FLAG }, [NL80211_ATTR_WIPHY_FREQ_OFFSET] = NLA_POLICY_RANGE(NLA_U32, 0, 999), + [NL80211_ATTR_SCAN_FREQ_KHZ] = { .type = NLA_NESTED }, }; /* policy for the key attributes */ @@ -7719,6 +7720,8 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct wireless_dev *wdev = info->user_ptr[1]; struct cfg80211_scan_request *request; + struct nlattr *scan_freqs = NULL; + bool scan_freqs_khz = false; struct nlattr *attr; struct wiphy *wiphy; int err, tmp, n_ssids = 0, n_channels, i; @@ -7737,9 +7740,17 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) goto unlock; } - if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) { - n_channels = validate_scan_freqs( - info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]); + if (info->attrs[NL80211_ATTR_SCAN_FREQ_KHZ]) { + if (!wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_SCAN_FREQ_KHZ)) + return -EOPNOTSUPP; + scan_freqs = info->attrs[NL80211_ATTR_SCAN_FREQ_KHZ]; + scan_freqs_khz = true; + } else if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) + scan_freqs = info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]; + + if (scan_freqs) { + n_channels = validate_scan_freqs(scan_freqs); if (!n_channels) { err = -EINVAL; goto unlock; @@ -7787,13 +7798,16 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) } i = 0; - if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) { + if (scan_freqs) { /* user specified, bail out if channel not found */ - nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_FREQUENCIES], tmp) { + nla_for_each_nested(attr, scan_freqs, tmp) { struct ieee80211_channel *chan; + int freq = nla_get_u32(attr); - chan = 
ieee80211_get_channel(wiphy, nla_get_u32(attr)); + if (!scan_freqs_khz) + freq = MHZ_TO_KHZ(freq); + chan = ieee80211_get_channel_khz(wiphy, freq); if (!chan) { err = -EINVAL; goto out_free; } @@ -15231,14 +15245,27 @@ static int nl80211_add_scan_req(struct sk_buff *msg, } nla_nest_end(msg, nest); - nest = nla_nest_start_noflag(msg, NL80211_ATTR_SCAN_FREQUENCIES); - if (!nest) - goto nla_put_failure; - for (i = 0; i < req->n_channels; i++) { - if (nla_put_u32(msg, i, req->channels[i]->center_freq)) + if (req->flags & NL80211_SCAN_FLAG_FREQ_KHZ) { + nest = nla_nest_start(msg, NL80211_ATTR_SCAN_FREQ_KHZ); + if (!nest) + goto nla_put_failure; + for (i = 0; i < req->n_channels; i++) { + if (nla_put_u32(msg, i, + ieee80211_channel_to_khz(req->channels[i]))) + goto nla_put_failure; + } + nla_nest_end(msg, nest); + } else { + nest = nla_nest_start_noflag(msg, + NL80211_ATTR_SCAN_FREQUENCIES); + if (!nest) goto nla_put_failure; + for (i = 0; i < req->n_channels; i++) { + if (nla_put_u32(msg, i, req->channels[i]->center_freq)) + goto nla_put_failure; + } + nla_nest_end(msg, nest); } - nla_nest_end(msg, nest); if (req->ie && nla_put(msg, NL80211_ATTR_IE, req->ie_len, req->ie)) -- cgit v1.2.3 From 396fba0a59f3c94d6fd6443fbeabd8bd9e3956eb Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 13:39:09 -0500 Subject: cfg80211: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kinds of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R.
Silva Link: https://lore.kernel.org/r/20200507183909.GA12993@embeddedor Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 6 +++--- include/net/cfg80211.h | 8 ++++---- net/wireless/core.h | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 2153d465d752..0320ca4c7d28 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -800,7 +800,7 @@ struct ieee80211_msrment_ie { u8 token; u8 mode; u8 type; - u8 request[0]; + u8 request[]; } __packed; /** @@ -1781,7 +1781,7 @@ struct ieee80211_he_operation { __le32 he_oper_params; __le16 he_mcs_nss_set; /* Optional 0,1,3,4,5,7 or 8 bytes: depends on @he_oper_params */ - u8 optional[0]; + u8 optional[]; } __packed; /** @@ -1793,7 +1793,7 @@ struct ieee80211_he_spr { u8 he_sr_control; /* Optional 0 to 19 bytes: depends on @he_sr_control */ - u8 optional[0]; + u8 optional[]; } __packed; /** diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7415f77d99ca..021366cfb2b0 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2037,7 +2037,7 @@ struct cfg80211_scan_request { bool no_cck; /* keep last */ - struct ieee80211_channel *channels[0]; + struct ieee80211_channel *channels[]; }; static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask) @@ -2183,7 +2183,7 @@ struct cfg80211_sched_scan_request { struct list_head list; /* keep last */ - struct ieee80211_channel *channels[0]; + struct ieee80211_channel *channels[]; }; /** @@ -2305,7 +2305,7 @@ struct cfg80211_bss { u8 bssid_index; u8 max_bssid_indicator; - u8 priv[0] __aligned(sizeof(void *)); + u8 priv[] __aligned(sizeof(void *)); }; /** @@ -4852,7 +4852,7 @@ struct wiphy { u8 max_data_retry_count; - char priv[0] __aligned(NETDEV_ALIGN); + char priv[] __aligned(NETDEV_ALIGN); }; static inline struct net *wiphy_net(struct wiphy *wiphy) diff --git a/net/wireless/core.h b/net/wireless/core.h index 639d41896573..e0e5b3ee9699 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -286,7 +286,7 @@ struct cfg80211_cqm_config { u32 rssi_hyst; s32 last_rssi_event_value; int n_rssi_thresholds; - s32 rssi_thresholds[0]; + s32 rssi_thresholds[]; }; void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev); -- cgit v1.2.3 From 3c23215ba8c70c0e9b16beffb7f700a401391e38 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 13:59:07 -0500 Subject: mac80211: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kinds of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1].
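To make the sizeof quirk concrete, here is a minimal sketch (illustrative only; the struct names are made up and do not come from the patch):

	/* GNU zero-length array: sizeof(tail) quietly evaluates to 0 */
	struct old_style {
		int stuff;
		int tail[0];
	};

	/* C99 flexible array member: tail has incomplete type */
	struct new_style {
		int stuff;
		int tail[];
	};

	/*
	 * sizeof(struct old_style) == sizeof(struct new_style) == sizeof(int),
	 * but sizeof(((struct new_style *)0)->tail) no longer compiles, which
	 * is exactly the accidental misuse the conversion flushes out, and a
	 * flexible array member anywhere but last in the struct is an error.
	 * Allocation is unchanged, e.g.:
	 *	struct new_style *p = kmalloc(struct_size(p, tail, n), GFP_KERNEL);
	 */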
There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20200507185907.GA15102@embeddedor Signed-off-by: Johannes Berg --- include/net/mac80211.h | 10 +++++----- net/mac80211/ieee80211_i.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 0d48e679efb0..7cb712427df1 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -230,7 +230,7 @@ struct ieee80211_chanctx_conf { bool radar_enabled; - u8 drv_priv[0] __aligned(sizeof(void *)); + u8 drv_priv[] __aligned(sizeof(void *)); }; /** @@ -1670,7 +1670,7 @@ struct ieee80211_vif { bool txqs_stopped[IEEE80211_NUM_ACS]; /* must be last */ - u8 drv_priv[0] __aligned(sizeof(void *)); + u8 drv_priv[] __aligned(sizeof(void *)); }; static inline bool ieee80211_vif_is_mesh(struct ieee80211_vif *vif) @@ -1798,7 +1798,7 @@ struct ieee80211_key_conf { s8 keyidx; u16 flags; u8 keylen; - u8 key[0]; + u8 key[]; }; #define IEEE80211_MAX_PN_LEN 16 @@ -2053,7 +2053,7 @@ struct ieee80211_sta { struct ieee80211_txq *txq[IEEE80211_NUM_TIDS + 1]; /* must be last */ - u8 drv_priv[0] __aligned(sizeof(void *)); + u8 drv_priv[] __aligned(sizeof(void *)); }; /** @@ -2099,7 +2099,7 @@ struct ieee80211_txq { u8 ac; /* must be last */ - u8 drv_priv[0] __aligned(sizeof(void *)); + u8 drv_priv[] __aligned(sizeof(void *)); }; /** diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 8cbae66b5cdb..2d1b6cb75497 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -267,7 +267,7 @@ struct probe_resp { struct rcu_head rcu_head; int len; u16 csa_counter_offsets[IEEE80211_MAX_CSA_COUNTERS_NUM]; - u8 data[0]; + u8 data[]; }; struct ps_data { -- cgit v1.2.3 From dca9ca2d588bd2c0989c671f048540b82e57cf1e Mon Sep 17 00:00:00 2001 From: Markus Theil Date: Fri, 8 May 2020 16:42:00 +0200 Subject: nl80211: add ability to report TX status for control port TX This adds the necessary capabilities in nl80211 to allow drivers to assign a cookie to control port TX frames (returned via extack in the netlink ACK message of the command) and then later report the frame's status. 
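As a rough sketch of the intended driver-side flow (only cfg80211_control_port_tx_status() and the cookie argument of the tx_control_port() op come from this patch; the drv_* names and the cookie scheme are hypothetical):

	/* Hand out a cookie when the frame is queued for TX ... */
	static int drv_tx_control_port(struct wiphy *wiphy, struct net_device *dev,
				       const u8 *buf, size_t len, const u8 *dest,
				       const __be16 proto, const bool noencrypt,
				       u64 *cookie)
	{
		static u64 next_cookie;		/* hypothetical driver state */

		if (cookie)
			*cookie = ++next_cookie;	/* remember it with the frame */
		/* ... hand the frame to the hardware queue here ... */
		return 0;
	}

	/* ... and report the result once TX completion arrives */
	static void drv_control_port_tx_done(struct wireless_dev *wdev, u64 cookie,
					     const u8 *buf, size_t len, bool acked)
	{
		cfg80211_control_port_tx_status(wdev, cookie, buf, len,
						acked, GFP_KERNEL);
	}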
Signed-off-by: Markus Theil Link: https://lore.kernel.org/r/20200508144202.7678-2-markus.theil@tu-ilmenau.de [use extack cookie instead of explicit message, recombine patches] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 20 +++++++++++++++++++- include/uapi/linux/nl80211.h | 12 ++++++++++++ net/mac80211/ieee80211_i.h | 3 ++- net/mac80211/tx.c | 3 ++- net/wireless/nl80211.c | 41 +++++++++++++++++++++++++++++++++-------- net/wireless/rdev-ops.h | 9 ++++++--- net/wireless/trace.h | 17 +++++++++++++++++ 7 files changed, 91 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 021366cfb2b0..f842f3652026 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4069,7 +4069,8 @@ struct cfg80211_ops { struct net_device *dev, const u8 *buf, size_t len, const u8 *dest, const __be16 proto, - const bool noencrypt); + const bool noencrypt, + u64 *cookie); int (*get_ftm_responder_stats)(struct wiphy *wiphy, struct net_device *dev, @@ -7049,6 +7050,23 @@ static inline bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, const u8 *buf, size_t len, bool ack, gfp_t gfp); +/** + * cfg80211_control_port_tx_status - notification of TX status for control + * port frames + * @wdev: wireless device receiving the frame + * @cookie: Cookie returned by cfg80211_ops::tx_control_port() + * @buf: Data frame (header + body) + * @len: length of the frame data + * @ack: Whether frame was acknowledged + * @gfp: context flags + * + * This function is called whenever a control port frame was requested to be + * transmitted with cfg80211_ops::tx_control_port() to report the TX status of + * the transmission attempt. + */ +void cfg80211_control_port_tx_status(struct wireless_dev *wdev, u64 cookie, + const u8 *buf, size_t len, bool ack, + gfp_t gfp); /** * cfg80211_rx_control_port - notification about a received control port frame diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 47d39b6a073d..0f324b6b81cc 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1164,6 +1164,12 @@ * dropped because it did not include a valid MME MIC while beacon * protection was enabled (BIGTK configured in station mode). * + * @NL80211_CMD_CONTROL_PORT_FRAME_TX_STATUS: Report TX status of a control + * port frame transmitted with %NL80211_CMD_CONTROL_PORT_FRAME. + * %NL80211_ATTR_COOKIE identifies the TX command and %NL80211_ATTR_FRAME + * includes the contents of the frame. %NL80211_ATTR_ACK flag is included + * if the recipient acknowledged the frame. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1392,6 +1398,8 @@ enum nl80211_commands { NL80211_CMD_UNPROT_BEACON, + NL80211_CMD_CONTROL_PORT_FRAME_TX_STATUS, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -5729,6 +5737,9 @@ enum nl80211_feature_flags { * report %NL80211_ATTR_SCAN_FREQ_KHZ, %NL80211_SCAN_FLAG_FREQ_KHZ must be * included in the scan request. * + * @NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_TX_STATUS: The driver + * can report tx status for control port over nl80211 tx operations. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. 
*/ @@ -5783,6 +5794,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS, NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT, NL80211_EXT_FEATURE_SCAN_FREQ_KHZ, + NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_TX_STATUS, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 2d1b6cb75497..b87dc873825b 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1800,7 +1800,8 @@ void ieee80211_check_fast_xmit_iface(struct ieee80211_sub_if_data *sdata); void ieee80211_clear_fast_xmit(struct sta_info *sta); int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len, - const u8 *dest, __be16 proto, bool unencrypted); + const u8 *dest, __be16 proto, bool unencrypted, + u64 *cookie); int ieee80211_probe_mesh_link(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 47f460c8bd74..5931128e1855 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -5339,7 +5339,8 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len, - const u8 *dest, __be16 proto, bool unencrypted) + const u8 *dest, __be16 proto, bool unencrypted, + u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 84bfa147769a..7ea764865546 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -13866,6 +13866,7 @@ static int nl80211_external_auth(struct sk_buff *skb, struct genl_info *info) static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) { + bool dont_wait_for_ack = info->attrs[NL80211_ATTR_DONT_WAIT_FOR_ACK]; struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; @@ -13874,6 +13875,7 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) u8 *dest; u16 proto; bool noencrypt; + u64 cookie = 0; int err; if (!wiphy_ext_feature_isset(&rdev->wiphy, @@ -13918,9 +13920,12 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) noencrypt = nla_get_flag(info->attrs[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT]); - return rdev_tx_control_port(rdev, dev, buf, len, - dest, cpu_to_be16(proto), noencrypt); - + err = rdev_tx_control_port(rdev, dev, buf, len, + dest, cpu_to_be16(proto), noencrypt, + dont_wait_for_ack ? 
NULL : &cookie); + if (!err && !dont_wait_for_ack) + nl_set_extack_cookie_u64(info->extack, cookie); + return err; out: wdev_unlock(wdev); return err; @@ -16294,8 +16299,9 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, return -ENOBUFS; } -void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, - const u8 *buf, size_t len, bool ack, gfp_t gfp) +static void nl80211_frame_tx_status(struct wireless_dev *wdev, u64 cookie, + const u8 *buf, size_t len, bool ack, + gfp_t gfp, enum nl80211_commands command) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); @@ -16303,13 +16309,16 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, struct sk_buff *msg; void *hdr; - trace_cfg80211_mgmt_tx_status(wdev, cookie, ack); + if (command == NL80211_CMD_FRAME_TX_STATUS) + trace_cfg80211_mgmt_tx_status(wdev, cookie, ack); + else + trace_cfg80211_control_port_tx_status(wdev, cookie, ack); msg = nlmsg_new(100 + len, gfp); if (!msg) return; - hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_FRAME_TX_STATUS); + hdr = nl80211hdr_put(msg, 0, 0, 0, command); if (!hdr) { nlmsg_free(msg); return; @@ -16332,9 +16341,25 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, NL80211_MCGRP_MLME, gfp); return; - nla_put_failure: +nla_put_failure: nlmsg_free(msg); } + +void cfg80211_control_port_tx_status(struct wireless_dev *wdev, u64 cookie, + const u8 *buf, size_t len, bool ack, + gfp_t gfp) +{ + nl80211_frame_tx_status(wdev, cookie, buf, len, ack, gfp, + NL80211_CMD_CONTROL_PORT_FRAME_TX_STATUS); +} +EXPORT_SYMBOL(cfg80211_control_port_tx_status); + +void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, + const u8 *buf, size_t len, bool ack, gfp_t gfp) +{ + nl80211_frame_tx_status(wdev, cookie, buf, len, ack, gfp, + NL80211_CMD_FRAME_TX_STATUS); +} EXPORT_SYMBOL(cfg80211_mgmt_tx_status); static int __nl80211_rx_control_port(struct net_device *dev, diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index df5142e86c4f..950d57494168 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -748,14 +748,17 @@ static inline int rdev_tx_control_port(struct cfg80211_registered_device *rdev, struct net_device *dev, const void *buf, size_t len, const u8 *dest, __be16 proto, - const bool noencrypt) + const bool noencrypt, u64 *cookie) { int ret; trace_rdev_tx_control_port(&rdev->wiphy, dev, buf, len, dest, proto, noencrypt); ret = rdev->ops->tx_control_port(&rdev->wiphy, dev, buf, len, - dest, proto, noencrypt); - trace_rdev_return_int(&rdev->wiphy, ret); + dest, proto, noencrypt, cookie); + if (cookie) + trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie); + else + trace_rdev_return_int(&rdev->wiphy, ret); return ret; } diff --git a/net/wireless/trace.h b/net/wireless/trace.h index f2ab44a2a3e4..b23cab016521 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2861,6 +2861,23 @@ TRACE_EVENT(cfg80211_mgmt_tx_status, WDEV_PR_ARG, __entry->cookie, BOOL_TO_STR(__entry->ack)) ); +TRACE_EVENT(cfg80211_control_port_tx_status, + TP_PROTO(struct wireless_dev *wdev, u64 cookie, bool ack), + TP_ARGS(wdev, cookie, ack), + TP_STRUCT__entry( + WDEV_ENTRY + __field(u64, cookie) + __field(bool, ack) + ), + TP_fast_assign( + WDEV_ASSIGN; + __entry->cookie = cookie; + __entry->ack = ack; + ), + TP_printk(WDEV_PR_FMT", cookie: %llu, ack: %s", + WDEV_PR_ARG, __entry->cookie, BOOL_TO_STR(__entry->ack)) +); + TRACE_EVENT(cfg80211_rx_control_port, TP_PROTO(struct net_device *netdev, 
struct sk_buff *skb, bool unencrypted), -- cgit v1.2.3 From 1ea02224afc29431880a67b8c3198146cc01d33e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 26 May 2020 10:31:33 +0200 Subject: mac80211: allow SA-QUERY processing in userspace As discussed with Mathy almost two years ago in http://lore.kernel.org/r/20180806224857.14853-1-Mathy.Vanhoef@cs.kuleuven.be we should let userspace process SA-QUERY frames if it wants to, so that it can handle OCV (operating channel validation), which mac80211 doesn't know how to do. Evidently I had been expecting Mathy to (re)send such a patch, but he never did, perhaps expecting me to do it after our discussion. In any case, this came up now with OCV getting more attention, so move the code around as discussed there to let userspace handle it, and do it properly. Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200526103131.1f9cf7e5b6db.Iae5b42b09ad2b1cbcbe13492002c43f0d1d51dfc@changeid Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 49 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 8e47b0d31051..8b0fa5e345f4 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3354,19 +3354,6 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) } } break; - case WLAN_CATEGORY_SA_QUERY: - if (len < (IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.sa_query))) - break; - - switch (mgmt->u.action.u.sa_query.action) { - case WLAN_ACTION_SA_QUERY_REQUEST: - if (sdata->vif.type != NL80211_IFTYPE_STATION) - break; - ieee80211_process_sa_query_req(sdata, mgmt, len); - goto handled; - } - break; case WLAN_CATEGORY_SELF_PROTECTED: if (len < (IEEE80211_MIN_ACTION_SIZE + sizeof(mgmt->u.action.u.self_prot.action_code))) @@ -3456,6 +3443,41 @@ ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx) return RX_CONTINUE; } +static ieee80211_rx_result debug_noinline +ieee80211_rx_h_action_post_userspace(struct ieee80211_rx_data *rx) +{ + struct ieee80211_sub_if_data *sdata = rx->sdata; + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data; + int len = rx->skb->len; + + if (!ieee80211_is_action(mgmt->frame_control)) + return RX_CONTINUE; + + switch (mgmt->u.action.category) { + case WLAN_CATEGORY_SA_QUERY: + if (len < (IEEE80211_MIN_ACTION_SIZE + + sizeof(mgmt->u.action.u.sa_query))) + break; + + switch (mgmt->u.action.u.sa_query.action) { + case WLAN_ACTION_SA_QUERY_REQUEST: + if (sdata->vif.type != NL80211_IFTYPE_STATION) + break; + ieee80211_process_sa_query_req(sdata, mgmt, len); + goto handled; + } + break; + } + + return RX_CONTINUE; + + handled: + if (rx->sta) + rx->sta->rx_stats.packets++; + dev_kfree_skb(rx->skb); + return RX_QUEUED; +} + static ieee80211_rx_result debug_noinline ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx) { @@ -3736,6 +3758,7 @@ static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx, CALL_RXH(ieee80211_rx_h_mgmt_check); CALL_RXH(ieee80211_rx_h_action); CALL_RXH(ieee80211_rx_h_userspace_mgmt); + CALL_RXH(ieee80211_rx_h_action_post_userspace); CALL_RXH(ieee80211_rx_h_action_return); CALL_RXH(ieee80211_rx_h_mgmt); -- cgit v1.2.3 From 9a5f6488623730dc16cca0836ade23869761adee Mon Sep 17 00:00:00 2001 From: Tamizh Chelvam Date: Wed, 13 May 2020 13:41:44 +0530 Subject: nl80211: Add support to configure TID specific Tx rate configuration This patch adds support for per-TID Tx rate configuration through the NL80211_TID_CONFIG_ATTR_TX_RATE* attributes.
It uses the nl80211_parse_tx_bitrate_mask() API to validate the Tx rate mask. Signed-off-by: Tamizh Chelvam Link: https://lore.kernel.org/r/1589357504-10175-1-git-send-email-tamizhr@codeaurora.org Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 29 +++++++++++++---------- include/uapi/linux/nl80211.h | 21 +++++++++++++++++ net/wireless/nl80211.c | 56 +++++++++++++++++++++++++++++++++----------- 3 files changed, 80 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index f842f3652026..e2dbc9c02ef3 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -630,6 +630,19 @@ struct cfg80211_chan_def { u16 freq1_offset; }; +/* + * cfg80211_bitrate_mask - masks for bitrate control + */ +struct cfg80211_bitrate_mask { + struct { + u32 legacy; + u8 ht_mcs[IEEE80211_HT_MCS_MASK_LEN]; + u16 vht_mcs[NL80211_VHT_NSS_MAX]; + enum nl80211_txrate_gi gi; + } control[NUM_NL80211_BANDS]; +}; + + /** * struct cfg80211_tid_cfg - TID specific configuration * @config_override: Flag to notify driver to reset TID configuration @@ -643,6 +656,8 @@ struct cfg80211_chan_def { * @ampdu: Enable/Disable MPDU aggregation * @rtscts: Enable/Disable RTS/CTS * @amsdu: Enable/Disable MSDU aggregation + * @txrate_type: Tx bitrate mask type + * @txrate_mask: Tx bitrate to be applied for the TID */ struct cfg80211_tid_cfg { bool config_override; @@ -653,6 +668,8 @@ struct cfg80211_tid_cfg { enum nl80211_tid_config ampdu; enum nl80211_tid_config rtscts; enum nl80211_tid_config amsdu; + enum nl80211_tx_rate_setting txrate_type; + struct cfg80211_bitrate_mask txrate_mask; }; /** @@ -1007,18 +1024,6 @@ struct cfg80211_acl_data { struct mac_address mac_addrs[]; }; -/* - * cfg80211_bitrate_mask - masks for bitrate control - */ -struct cfg80211_bitrate_mask { - struct { - u32 legacy; - u8 ht_mcs[IEEE80211_HT_MCS_MASK_LEN]; - u16 vht_mcs[NL80211_VHT_NSS_MAX]; - enum nl80211_txrate_gi gi; - } control[NUM_NL80211_BANDS]; -}; - /** * enum cfg80211_ap_settings_flags - AP settings flags * diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 0f324b6b81cc..c14666b75e57 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4841,6 +4841,17 @@ enum nl80211_tid_config { NL80211_TID_CONFIG_DISABLE, }; +/* enum nl80211_tx_rate_setting - TX rate configuration type + * @NL80211_TX_RATE_AUTOMATIC: automatically determine TX rate + * @NL80211_TX_RATE_LIMITED: limit the TX rate by the TX rate parameter + * @NL80211_TX_RATE_FIXED: fix TX rate to the TX rate parameter + */ +enum nl80211_tx_rate_setting { + NL80211_TX_RATE_AUTOMATIC, + NL80211_TX_RATE_LIMITED, + NL80211_TX_RATE_FIXED, +}; + /* enum nl80211_tid_config_attr - TID specific configuration. * @NL80211_TID_CONFIG_ATTR_PAD: pad attribute for 64-bit values * @NL80211_TID_CONFIG_ATTR_VIF_SUPP: a bitmap (u64) of attributes supported @@ -4876,6 +4887,14 @@ enum nl80211_tid_config { * @NL80211_TID_CONFIG_ATTR_AMSDU_CTRL: Enable/Disable MSDU aggregation * for the TIDs specified in %NL80211_TID_CONFIG_ATTR_TIDS. * Its type is u8, using the values from &nl80211_tid_config. + * @NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE: This attribute notifies the + * driver what type of TX rate should be used for the TIDs specified + * in %NL80211_TID_CONFIG_ATTR_TIDS, using + * the values from &nl80211_tx_rate_setting. + * @NL80211_TID_CONFIG_ATTR_TX_RATE: Data frame TX rate mask should be applied + * with the parameters passed through %NL80211_ATTR_TX_RATES.
+ * The configuration is applied to the data frames of the given TID for + * that connected station. */ enum nl80211_tid_config_attr { __NL80211_TID_CONFIG_ATTR_INVALID, @@ -4890,6 +4909,8 @@ enum nl80211_tid_config_attr { NL80211_TID_CONFIG_ATTR_AMPDU_CTRL, NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL, NL80211_TID_CONFIG_ATTR_AMSDU_CTRL, + NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE, + NL80211_TID_CONFIG_ATTR_TX_RATE, /* keep last */ __NL80211_TID_CONFIG_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7ea764865546..22c4d13e28cb 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -329,6 +329,15 @@ he_bss_color_policy[NL80211_HE_BSS_COLOR_ATTR_MAX + 1] = { [NL80211_HE_BSS_COLOR_ATTR_PARTIAL] = { .type = NLA_FLAG }, }; +static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { + [NL80211_TXRATE_LEGACY] = { .type = NLA_BINARY, + .len = NL80211_MAX_SUPP_RATES }, + [NL80211_TXRATE_HT] = { .type = NLA_BINARY, + .len = NL80211_MAX_SUPP_HT_RATES }, + [NL80211_TXRATE_VHT] = NLA_POLICY_EXACT_LEN_WARN(sizeof(struct nl80211_txrate_vht)), + [NL80211_TXRATE_GI] = { .type = NLA_U8 }, +}; + static const struct nla_policy nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = { [NL80211_TID_CONFIG_ATTR_VIF_SUPP] = { .type = NLA_U64 }, @@ -345,6 +354,10 @@ nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = { NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), [NL80211_TID_CONFIG_ATTR_AMSDU_CTRL] = NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), + [NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE] = + NLA_POLICY_MAX(NLA_U8, NL80211_TX_RATE_FIXED), + [NL80211_TID_CONFIG_ATTR_TX_RATE] = + NLA_POLICY_NESTED(nl80211_txattr_policy), }; static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { @@ -4388,16 +4401,9 @@ static bool vht_set_mcs_mask(struct ieee80211_supported_band *sband, return true; } -static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { - [NL80211_TXRATE_LEGACY] = { .type = NLA_BINARY, - .len = NL80211_MAX_SUPP_RATES }, - [NL80211_TXRATE_HT] = { .type = NLA_BINARY, - .len = NL80211_MAX_SUPP_HT_RATES }, - [NL80211_TXRATE_VHT] = NLA_POLICY_EXACT_LEN_WARN(sizeof(struct nl80211_txrate_vht)), - [NL80211_TXRATE_GI] = { .type = NLA_U8 }, -}; - static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, + struct nlattr *attrs[], + enum nl80211_attrs attr, struct cfg80211_bitrate_mask *mask) { struct nlattr *tb[NL80211_TXRATE_MAX + 1]; @@ -4428,14 +4434,14 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, } /* if no rates are given set it back to the defaults */ - if (!info->attrs[NL80211_ATTR_TX_RATES]) + if (!attrs[attr]) goto out; /* The nested attribute uses enum nl80211_band as the index. This maps * directly to the enum nl80211_band values used in cfg80211.
*/ BUILD_BUG_ON(NL80211_MAX_SUPP_HT_RATES > IEEE80211_HT_MCS_MASK_LEN * 8); - nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem) { + nla_for_each_nested(tx_rates, attrs[attr], rem) { enum nl80211_band band = nla_type(tx_rates); int err; @@ -4940,7 +4946,9 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) return -EINVAL; if (info->attrs[NL80211_ATTR_TX_RATES]) { - err = nl80211_parse_tx_bitrate_mask(info, &params.beacon_rate); + err = nl80211_parse_tx_bitrate_mask(info, info->attrs, + NL80211_ATTR_TX_RATES, + &params.beacon_rate); if (err) return err; @@ -10753,7 +10761,8 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, if (!rdev->ops->set_bitrate_mask) return -EOPNOTSUPP; - err = nl80211_parse_tx_bitrate_mask(info, &mask); + err = nl80211_parse_tx_bitrate_mask(info, info->attrs, + NL80211_ATTR_TX_RATES, &mask); if (err) return err; @@ -11359,7 +11368,9 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) } if (info->attrs[NL80211_ATTR_TX_RATES]) { - err = nl80211_parse_tx_bitrate_mask(info, &setup.beacon_rate); + err = nl80211_parse_tx_bitrate_mask(info, info->attrs, + NL80211_ATTR_TX_RATES, + &setup.beacon_rate); if (err) return err; @@ -14139,6 +14150,23 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev, nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_AMSDU_CTRL]); } + if (attrs[NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE]) { + u32 idx = NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE, attr; + + tid_conf->txrate_type = nla_get_u8(attrs[idx]); + + if (tid_conf->txrate_type != NL80211_TX_RATE_AUTOMATIC) { + attr = NL80211_TID_CONFIG_ATTR_TX_RATE; + err = nl80211_parse_tx_bitrate_mask(info, attrs, attr, + &tid_conf->txrate_mask); + if (err) + return err; + + tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_TX_RATE); + } + tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE); + } + if (peer) mask = rdev->wiphy.tid_config_support.peer; else -- cgit v1.2.3 From a3b018febccd3686c39e86e98b5081bde014fc66 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Sun, 17 May 2020 18:30:19 +0200 Subject: cfg80211: fix CFG80211_CRDA_SUPPORT still mentioning internal regdb Back with commit c8c240e284b3 (cfg80211: reg: remove support for built-in regdb, 2015-10-15), support for using CFG80211_INTERNAL_REGDB was removed in favor of loading the regulatory database as a firmware file. The documentation of CFG80211_CRDA_SUPPORT was not adjusted, though, which is why it still mentions the old way of loading via the internal regulatory database. Remove it so that the kernel option only mentions using the firmware file. Signed-off-by: Patrick Steinhardt Link: https://lore.kernel.org/r/c56e60207fbd0512029de8c6276ee00f73491924.1589732954.git.ps@pks.im Signed-off-by: Johannes Berg --- net/wireless/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index 63cf7131f601..813e93644ae7 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -181,8 +181,8 @@ config CFG80211_CRDA_SUPPORT default y help You should enable this option unless you know for sure you have no - need for it, for example when using internal regdb (above) or the - database loaded as a firmware file. + need for it, for example when using the regulatory database loaded as + a firmware file. If unsure, say Y.
-- cgit v1.2.3 From c11299243370580832c27882dcedf2604f9f48f8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 26 May 2020 14:33:48 +0200 Subject: mac80211: fix HT-Control field reception for management frames If we receive management frames with an HT-Control field, we cannot parse them properly, as we assume a fixed length management header. Since we don't even need the HTC field (for these frames, or really at all), just remove it at the beginning of RX. Reported-by: Haggai Abramovsky Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200526143346.cf5ce70521c5.I333251a084ec4cfe67b7ef7efe2d2f1a33883931@changeid Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 44 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 8b0fa5e345f4..21854a61a2b7 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -93,13 +93,44 @@ static u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, * This function cleans up the SKB, i.e. it removes all the stuff * only useful for monitoring. */ -static void remove_monitor_info(struct sk_buff *skb, - unsigned int present_fcs_len, - unsigned int rtap_space) +static struct sk_buff *ieee80211_clean_skb(struct sk_buff *skb, + unsigned int present_fcs_len, + unsigned int rtap_space) { + struct ieee80211_hdr *hdr; + unsigned int hdrlen; + __le16 fc; + if (present_fcs_len) __pskb_trim(skb, skb->len - present_fcs_len); __pskb_pull(skb, rtap_space); + + hdr = (void *)skb->data; + fc = hdr->frame_control; + + /* + * Remove the HT-Control field (if present) on management + * frames after we've sent the frame to monitoring. We + * (currently) don't need it, and don't properly parse + * frames with it present, due to the assumption of a + * fixed management header length. + */ + if (likely(!ieee80211_is_mgmt(fc) || !ieee80211_has_order(fc))) + return skb; + + hdrlen = ieee80211_hdrlen(fc); + hdr->frame_control &= ~cpu_to_le16(IEEE80211_FCTL_ORDER); + + if (!pskb_may_pull(skb, hdrlen)) { + dev_kfree_skb(skb); + return NULL; + } + + memmove(skb->data + IEEE80211_HT_CTL_LEN, skb->data, + hdrlen - IEEE80211_HT_CTL_LEN); + __pskb_pull(skb, IEEE80211_HT_CTL_LEN); + + return skb; } static inline bool should_drop_frame(struct sk_buff *skb, int present_fcs_len, @@ -827,8 +858,8 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, return NULL; } - remove_monitor_info(origskb, present_fcs_len, rtap_space); - return origskb; + return ieee80211_clean_skb(origskb, present_fcs_len, + rtap_space); } ieee80211_handle_mu_mimo_mon(monitor_sdata, origskb, rtap_space); @@ -871,8 +902,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, if (!origskb) return NULL; - remove_monitor_info(origskb, present_fcs_len, rtap_space); - return origskb; + return ieee80211_clean_skb(origskb, present_fcs_len, rtap_space); } static void ieee80211_parse_qos(struct ieee80211_rx_data *rx) -- cgit v1.2.3 From d8e79f1dbcee8032667c0718a654c749d64f6304 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 27 May 2020 01:00:20 -0700 Subject: nexthop: Fix type of event_type in call_nexthop_notifiers Clang warns: net/ipv4/nexthop.c:841:30: warning: implicit conversion from enumeration type 'enum nexthop_event_type' to different enumeration type 'enum fib_event_type' [-Wenum-conversion] call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh); ~~~~~~~~~~~~~~~~~~~~~~ ^~~~~~~~~~~~~~~~~ 1 warning generated. 
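The pattern clang objects to boils down to this standalone illustration (a sketch, not kernel code; the enumerators are reduced to one each):

	enum nexthop_event_type { NEXTHOP_EVENT_DEL };
	enum fib_event_type { FIB_EVENT_ENTRY_ADD };

	static int notify(enum fib_event_type event_type)
	{
		return (int)event_type;
	}

	int trigger(void)
	{
		/* implicit conversion between distinct enum types: clang warns */
		return notify(NEXTHOP_EVENT_DEL);
	}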
Use the right type for event_type so that clang does not warn. Fixes: 8590ceedb701 ("nexthop: add support for notifiers") Link: https://github.com/ClangBuiltLinux/linux/issues/1038 Signed-off-by: Nathan Chancellor Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/nexthop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 143011f9b580..ec1282858cb7 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -37,7 +37,7 @@ static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = { }; static int call_nexthop_notifiers(struct net *net, - enum fib_event_type event_type, + enum nexthop_event_type event_type, struct nexthop *nh) { int err; -- cgit v1.2.3 From 20f6a05ef63594feb0c6dfbd629da0448b43124d Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Wed, 27 May 2020 12:34:30 +0000 Subject: bridge: mrp: Rework the MRP netlink interface This patch reworks the MRP netlink interface. Before, each attribute represented a binary structure, which made it hard to extend. Therefore update the MRP netlink interface such that each existing attribute becomes a nested attribute that contains the fields of the binary structures. In this way the MRP netlink interface can be extended without breaking backwards compatibility. It also uses strict checking for attributes under the MRP top attribute. Signed-off-by: Horatiu Vultur Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 64 ++++++++-- net/bridge/br_mrp.c | 8 +- net/bridge/br_mrp_netlink.c | 266 +++++++++++++++++++++++++++++++++------ net/bridge/br_private_mrp.h | 2 +- 4 files changed, 290 insertions(+), 50 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index bd8c95488f16..5a43eb86c93b 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -169,17 +169,69 @@ enum { __IFLA_BRIDGE_MRP_MAX, }; +#define IFLA_BRIDGE_MRP_MAX (__IFLA_BRIDGE_MRP_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_INSTANCE_UNSPEC, + IFLA_BRIDGE_MRP_INSTANCE_RING_ID, + IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX, + IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX, + __IFLA_BRIDGE_MRP_INSTANCE_MAX, +}; + +#define IFLA_BRIDGE_MRP_INSTANCE_MAX (__IFLA_BRIDGE_MRP_INSTANCE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_PORT_STATE_UNSPEC, + IFLA_BRIDGE_MRP_PORT_STATE_STATE, + __IFLA_BRIDGE_MRP_PORT_STATE_MAX, +}; + +#define IFLA_BRIDGE_MRP_PORT_STATE_MAX (__IFLA_BRIDGE_MRP_PORT_STATE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_PORT_ROLE_UNSPEC, + IFLA_BRIDGE_MRP_PORT_ROLE_ROLE, + __IFLA_BRIDGE_MRP_PORT_ROLE_MAX, +}; + +#define IFLA_BRIDGE_MRP_PORT_ROLE_MAX (__IFLA_BRIDGE_MRP_PORT_ROLE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_RING_STATE_UNSPEC, + IFLA_BRIDGE_MRP_RING_STATE_RING_ID, + IFLA_BRIDGE_MRP_RING_STATE_STATE, + __IFLA_BRIDGE_MRP_RING_STATE_MAX, +}; + +#define IFLA_BRIDGE_MRP_RING_STATE_MAX (__IFLA_BRIDGE_MRP_RING_STATE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_RING_ROLE_UNSPEC, + IFLA_BRIDGE_MRP_RING_ROLE_RING_ID, + IFLA_BRIDGE_MRP_RING_ROLE_ROLE, + __IFLA_BRIDGE_MRP_RING_ROLE_MAX, +}; + +#define IFLA_BRIDGE_MRP_RING_ROLE_MAX (__IFLA_BRIDGE_MRP_RING_ROLE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_START_TEST_UNSPEC, + IFLA_BRIDGE_MRP_START_TEST_RING_ID, + IFLA_BRIDGE_MRP_START_TEST_INTERVAL, + IFLA_BRIDGE_MRP_START_TEST_MAX_MISS, + IFLA_BRIDGE_MRP_START_TEST_PERIOD, + __IFLA_BRIDGE_MRP_START_TEST_MAX, +}; + +#define IFLA_BRIDGE_MRP_START_TEST_MAX (__IFLA_BRIDGE_MRP_START_TEST_MAX - 1) + struct
br_mrp_instance { __u32 ring_id; __u32 p_ifindex; __u32 s_ifindex; }; -struct br_mrp_port_role { - __u32 ring_id; - __u32 role; -}; - struct br_mrp_ring_state { __u32 ring_id; __u32 ring_state; @@ -197,8 +249,6 @@ struct br_mrp_start_test { __u32 period; }; -#define IFLA_BRIDGE_MRP_MAX (__IFLA_BRIDGE_MRP_MAX - 1) - struct bridge_stp_xstats { __u64 transition_blk; __u64 transition_fwd; diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index 528d767eb026..8ea59504ef47 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -376,24 +376,24 @@ int br_mrp_set_port_state(struct net_bridge_port *p, * note: already called with rtnl_lock */ int br_mrp_set_port_role(struct net_bridge_port *p, - struct br_mrp_port_role *role) + enum br_mrp_port_role_type role) { struct br_mrp *mrp; if (!p || !(p->flags & BR_MRP_AWARE)) return -EINVAL; - mrp = br_mrp_find_id(p->br, role->ring_id); + mrp = br_mrp_find_port(p->br, p); if (!mrp) return -EINVAL; - if (role->role == BR_MRP_PORT_ROLE_PRIMARY) + if (role == BR_MRP_PORT_ROLE_PRIMARY) rcu_assign_pointer(mrp->p_port, p); else rcu_assign_pointer(mrp->s_port, p); - br_mrp_port_switchdev_set_role(p, role->role); + br_mrp_port_switchdev_set_role(p, role); return 0; } diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c index 4a08a99519b0..d9de780d2ce0 100644 --- a/net/bridge/br_mrp_netlink.c +++ b/net/bridge/br_mrp_netlink.c @@ -8,19 +8,222 @@ static const struct nla_policy br_mrp_policy[IFLA_BRIDGE_MRP_MAX + 1] = { [IFLA_BRIDGE_MRP_UNSPEC] = { .type = NLA_REJECT }, - [IFLA_BRIDGE_MRP_INSTANCE] = { .type = NLA_EXACT_LEN, - .len = sizeof(struct br_mrp_instance)}, - [IFLA_BRIDGE_MRP_PORT_STATE] = { .type = NLA_U32 }, - [IFLA_BRIDGE_MRP_PORT_ROLE] = { .type = NLA_EXACT_LEN, - .len = sizeof(struct br_mrp_port_role)}, - [IFLA_BRIDGE_MRP_RING_STATE] = { .type = NLA_EXACT_LEN, - .len = sizeof(struct br_mrp_ring_state)}, - [IFLA_BRIDGE_MRP_RING_ROLE] = { .type = NLA_EXACT_LEN, - .len = sizeof(struct br_mrp_ring_role)}, - [IFLA_BRIDGE_MRP_START_TEST] = { .type = NLA_EXACT_LEN, - .len = sizeof(struct br_mrp_start_test)}, + [IFLA_BRIDGE_MRP_INSTANCE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_PORT_STATE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_PORT_ROLE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_RING_STATE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_RING_ROLE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_START_TEST] = { .type = NLA_NESTED }, }; +static const struct nla_policy +br_mrp_instance_policy[IFLA_BRIDGE_MRP_INSTANCE_MAX + 1] = { + [IFLA_BRIDGE_MRP_INSTANCE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_INSTANCE_RING_ID] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX] = { .type = NLA_U32 }, +}; + +static int br_mrp_instance_parse(struct net_bridge *br, struct nlattr *attr, + int cmd, struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_INSTANCE_MAX + 1]; + struct br_mrp_instance inst; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_INSTANCE_MAX, attr, + br_mrp_instance_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_INSTANCE_RING_ID] || + !tb[IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX] || + !tb[IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing attribute: RING_ID or P_IFINDEX or S_IFINDEX"); + return -EINVAL; + } + + memset(&inst, 0, sizeof(inst)); + + inst.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_RING_ID]); + inst.p_ifindex = 
nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX]); + inst.s_ifindex = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX]); + + if (cmd == RTM_SETLINK) + return br_mrp_add(br, &inst); + else + return br_mrp_del(br, &inst); + + return 0; +} + +static const struct nla_policy +br_mrp_port_state_policy[IFLA_BRIDGE_MRP_PORT_STATE_MAX + 1] = { + [IFLA_BRIDGE_MRP_PORT_STATE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_PORT_STATE_STATE] = { .type = NLA_U32 }, +}; + +static int br_mrp_port_state_parse(struct net_bridge_port *p, + struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_PORT_STATE_MAX + 1]; + enum br_mrp_port_state_type state; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_PORT_STATE_MAX, attr, + br_mrp_port_state_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_PORT_STATE_STATE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing attribute: STATE"); + return -EINVAL; + } + + state = nla_get_u32(tb[IFLA_BRIDGE_MRP_PORT_STATE_STATE]); + + return br_mrp_set_port_state(p, state); +} + +static const struct nla_policy +br_mrp_port_role_policy[IFLA_BRIDGE_MRP_PORT_ROLE_MAX + 1] = { + [IFLA_BRIDGE_MRP_PORT_ROLE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_PORT_ROLE_ROLE] = { .type = NLA_U32 }, +}; + +static int br_mrp_port_role_parse(struct net_bridge_port *p, + struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_PORT_ROLE_MAX + 1]; + enum br_mrp_port_role_type role; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_PORT_ROLE_MAX, attr, + br_mrp_port_role_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_PORT_ROLE_ROLE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing attribute: ROLE"); + return -EINVAL; + } + + role = nla_get_u32(tb[IFLA_BRIDGE_MRP_PORT_ROLE_ROLE]); + + return br_mrp_set_port_role(p, role); +} + +static const struct nla_policy +br_mrp_ring_state_policy[IFLA_BRIDGE_MRP_RING_STATE_MAX + 1] = { + [IFLA_BRIDGE_MRP_RING_STATE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_RING_STATE_RING_ID] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_RING_STATE_STATE] = { .type = NLA_U32 }, +}; + +static int br_mrp_ring_state_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_RING_STATE_MAX + 1]; + struct br_mrp_ring_state state; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_RING_STATE_MAX, attr, + br_mrp_ring_state_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_RING_STATE_RING_ID] || + !tb[IFLA_BRIDGE_MRP_RING_STATE_STATE]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing attribute: RING_ID or STATE"); + return -EINVAL; + } + + memset(&state, 0x0, sizeof(state)); + + state.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_STATE_RING_ID]); + state.ring_state = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_STATE_STATE]); + + return br_mrp_set_ring_state(br, &state); +} + +static const struct nla_policy +br_mrp_ring_role_policy[IFLA_BRIDGE_MRP_RING_ROLE_MAX + 1] = { + [IFLA_BRIDGE_MRP_RING_ROLE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_RING_ROLE_RING_ID] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_RING_ROLE_ROLE] = { .type = NLA_U32 }, +}; + +static int br_mrp_ring_role_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_RING_ROLE_MAX + 1]; + struct br_mrp_ring_role role; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_RING_ROLE_MAX, attr, + br_mrp_ring_role_policy, extack); + if 
(err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_RING_ROLE_RING_ID] || + !tb[IFLA_BRIDGE_MRP_RING_ROLE_ROLE]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing attribute: RING_ID or ROLE"); + return -EINVAL; + } + + memset(&role, 0x0, sizeof(role)); + + role.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_ROLE_RING_ID]); + role.ring_role = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_ROLE_ROLE]); + + return br_mrp_set_ring_role(br, &role); +} + +static const struct nla_policy +br_mrp_start_test_policy[IFLA_BRIDGE_MRP_START_TEST_MAX + 1] = { + [IFLA_BRIDGE_MRP_START_TEST_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_START_TEST_RING_ID] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_START_TEST_INTERVAL] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_START_TEST_MAX_MISS] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_START_TEST_PERIOD] = { .type = NLA_U32 }, +}; + +static int br_mrp_start_test_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_START_TEST_MAX + 1]; + struct br_mrp_start_test test; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_START_TEST_MAX, attr, + br_mrp_start_test_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_START_TEST_RING_ID] || + !tb[IFLA_BRIDGE_MRP_START_TEST_INTERVAL] || + !tb[IFLA_BRIDGE_MRP_START_TEST_MAX_MISS] || + !tb[IFLA_BRIDGE_MRP_START_TEST_PERIOD]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing attribute: RING_ID or INTERVAL or MAX_MISS or PERIOD"); + return -EINVAL; + } + + memset(&test, 0x0, sizeof(test)); + + test.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_RING_ID]); + test.interval = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_INTERVAL]); + test.max_miss = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_MAX_MISS]); + test.period = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_PERIOD]); + + return br_mrp_start_test(br, &test); +} + int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) { @@ -44,58 +247,45 @@ int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, return err; if (tb[IFLA_BRIDGE_MRP_INSTANCE]) { - struct br_mrp_instance *instance = - nla_data(tb[IFLA_BRIDGE_MRP_INSTANCE]); - - if (cmd == RTM_SETLINK) - err = br_mrp_add(br, instance); - else - err = br_mrp_del(br, instance); + err = br_mrp_instance_parse(br, tb[IFLA_BRIDGE_MRP_INSTANCE], + cmd, extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_PORT_STATE]) { - enum br_mrp_port_state_type state = - nla_get_u32(tb[IFLA_BRIDGE_MRP_PORT_STATE]); - - err = br_mrp_set_port_state(p, state); + err = br_mrp_port_state_parse(p, tb[IFLA_BRIDGE_MRP_PORT_STATE], + extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_PORT_ROLE]) { - struct br_mrp_port_role *role = - nla_data(tb[IFLA_BRIDGE_MRP_PORT_ROLE]); - - err = br_mrp_set_port_role(p, role); + err = br_mrp_port_role_parse(p, tb[IFLA_BRIDGE_MRP_PORT_ROLE], + extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_RING_STATE]) { - struct br_mrp_ring_state *state = - nla_data(tb[IFLA_BRIDGE_MRP_RING_STATE]); - - err = br_mrp_set_ring_state(br, state); + err = br_mrp_ring_state_parse(br, + tb[IFLA_BRIDGE_MRP_RING_STATE], + extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_RING_ROLE]) { - struct br_mrp_ring_role *role = - nla_data(tb[IFLA_BRIDGE_MRP_RING_ROLE]); - - err = br_mrp_set_ring_role(br, role); + err = br_mrp_ring_role_parse(br, tb[IFLA_BRIDGE_MRP_RING_ROLE], + extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_START_TEST]) { - struct br_mrp_start_test *test = - 
nla_data(tb[IFLA_BRIDGE_MRP_START_TEST]); - - err = br_mrp_start_test(br, test); + err = br_mrp_start_test_parse(br, + tb[IFLA_BRIDGE_MRP_START_TEST], + extack); if (err) return err; } diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h index 2921a4b59f8e..a0f53cc3ab85 100644 --- a/net/bridge/br_private_mrp.h +++ b/net/bridge/br_private_mrp.h @@ -37,7 +37,7 @@ int br_mrp_del(struct net_bridge *br, struct br_mrp_instance *instance); int br_mrp_set_port_state(struct net_bridge_port *p, enum br_mrp_port_state_type state); int br_mrp_set_port_role(struct net_bridge_port *p, - struct br_mrp_port_role *role); + enum br_mrp_port_role_type role); int br_mrp_set_ring_state(struct net_bridge *br, struct br_mrp_ring_state *state); int br_mrp_set_ring_role(struct net_bridge *br, struct br_mrp_ring_role *role); -- cgit v1.2.3 From cb8aa9a3affb7d23b11b11fbed41e2feaabc4b0a Mon Sep 17 00:00:00 2001 From: Romain Bellan Date: Mon, 4 May 2020 21:34:29 +0200 Subject: netfilter: ctnetlink: add kernel side filtering for dump Conntrack dump does not support kernel-side filtering (only get exists, but it returns only one entry, and the user has to give a full valid tuple). It means that userspace has to implement filtering after receiving many irrelevant entries, consuming resources (the conntrack table is sometimes very large, much larger than a routing table, for example). This patch adds filtering on the kernel side. To achieve this goal, we: * Add a new CTA_FILTER netlink attribute, actually a flag list to parametrize filtering * Convert some *nlattr_to_tuple() functions, to allow a partial parsing of CTA_TUPLE_ORIG and CTA_TUPLE_REPLY (so the nf_conntrack_tuple is not fully set) Filtering is now possible on: * IP SRC/DST values * Ports for TCP and UDP flows * ICMP(v6) types, codes and IDs Filtering is done as an "AND" operator. For example, when the flags PROTO_SRC_PORT, PROTO_NUM and IP_SRC are set, only entries matching all values are dumped.
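As a rough userspace sketch (using libmnl; note that the filter flag bits live in the kernel-internal nf_internals.h in this patch, so the bit value used below is an assumption, not uapi):

	#include <arpa/inet.h>
	#include <libmnl/libmnl.h>
	#include <linux/netfilter/nfnetlink.h>
	#include <linux/netfilter/nfnetlink_conntrack.h>

	/* assumed value of the kernel-internal CTA_FILTER_FLAG(CTA_IP_SRC) bit */
	#define ASSUMED_CTA_FILTER_F_CTA_IP_SRC (1 << 0)

	/* Build a dump request that only matches entries whose original
	 * tuple has the given IPv4 source address.
	 */
	static void build_filtered_dump(char *buf, unsigned int seq)
	{
		struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
		struct nfgenmsg *nfh;
		struct nlattr *tuple, *ip, *filter;

		nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_GET;
		nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
		nlh->nlmsg_seq = seq;

		nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfh));
		nfh->nfgen_family = AF_INET;
		nfh->version = NFNETLINK_V0;

		/* CTA_TUPLE_ORIG carries the values to compare against ... */
		tuple = mnl_attr_nest_start(nlh, CTA_TUPLE_ORIG);
		ip = mnl_attr_nest_start(nlh, CTA_TUPLE_IP);
		mnl_attr_put_u32(nlh, CTA_IP_V4_SRC, inet_addr("192.0.2.1"));
		mnl_attr_nest_end(nlh, ip);
		mnl_attr_nest_end(nlh, tuple);

		/* ... and CTA_FILTER_ORIG_FLAGS says which of them matter */
		filter = mnl_attr_nest_start(nlh, CTA_FILTER);
		mnl_attr_put_u32(nlh, CTA_FILTER_ORIG_FLAGS,
				 ASSUMED_CTA_FILTER_F_CTA_IP_SRC);
		mnl_attr_nest_end(nlh, filter);
	}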
Changes since v1: Set NLM_F_DUMP_FILTERED in nlm flags if entries are filtered Changes since v2: Move several constants to nf_internals.h Move a fix on netlink values check in a separate patch Add a check on unsupported flags Return EOPNOTSUPP if CTA_FILTER is set in ctnetlink_flush_conntrack (not yet implemented) Fix code style issues Changes since v3: Fix compilation warning reported by kbuild test robot Changes since v4: Fix a regression introduced in v3 (returned EINVAL for valid netlink messages without CTA_MARK) Changes since v5: Change definition of CTA_FILTER_F_ALL Fix a regression when CTA_TUPLE_ZONE is not set
Signed-off-by: Romain Bellan Signed-off-by: Florent Fourcot Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 6 +- include/uapi/linux/netfilter/nfnetlink_conntrack.h | 9 + net/netfilter/nf_conntrack_core.c | 19 +- net/netfilter/nf_conntrack_netlink.c | 334 ++++++++++++++++++--- net/netfilter/nf_conntrack_proto_icmp.c | 40 ++- net/netfilter/nf_conntrack_proto_icmpv6.c | 42 ++- net/netfilter/nf_internals.h | 17 ++ 7 files changed, 394 insertions(+), 73 deletions(-) (limited to 'net')
diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 4cad1f0a327a..88186b95b3c2 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -42,7 +42,8 @@ struct nf_conntrack_l4proto { /* Calculate tuple nlattr size */ unsigned int (*nlattr_tuple_size)(void); int (*nlattr_to_tuple)(struct nlattr *tb[], - struct nf_conntrack_tuple *t); + struct nf_conntrack_tuple *t, + u_int32_t flags); const struct nla_policy *nla_policy; struct { @@ -152,7 +153,8 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto); int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple); int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *t); + struct nf_conntrack_tuple *t, + u_int32_t flags); unsigned int nf_ct_port_nlattr_tuple_size(void); extern const struct nla_policy nf_ct_port_nla_policy[];
diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index 1d41810d17e2..262881792671 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -55,6 +55,7 @@ enum ctattr_type { CTA_LABELS, CTA_LABELS_MASK, CTA_SYNPROXY, + CTA_FILTER, __CTA_MAX }; #define CTA_MAX (__CTA_MAX - 1) @@ -276,4 +277,12 @@ enum ctattr_expect_stats { }; #define CTA_STATS_EXP_MAX (__CTA_STATS_EXP_MAX - 1) +enum ctattr_filter { + CTA_FILTER_UNSPEC, + CTA_FILTER_ORIG_FLAGS, + CTA_FILTER_REPLY_FLAGS, + __CTA_FILTER_MAX +}; +#define CTA_FILTER_MAX (__CTA_FILTER_MAX - 1) + #endif /* _IPCONNTRACK_NETLINK_H */
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 1d57b95d3481..8abb1727bcc4 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1974,13 +1974,22 @@ const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = { EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy); int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *t) + struct nf_conntrack_tuple *t, + u_int32_t flags) { - if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT]) - return -EINVAL; + if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) { + if (!tb[CTA_PROTO_SRC_PORT]) + return -EINVAL; + + t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); + } -
t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); - t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); + if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) { + if (!tb[CTA_PROTO_DST_PORT]) + return -EINVAL; + + t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); + } return 0; } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 9ddfcd002d3b..d7bd8b1f27d5 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -54,6 +54,8 @@ #include #include +#include "nf_internals.h" + MODULE_LICENSE("GPL"); static int ctnetlink_dump_tuples_proto(struct sk_buff *skb, @@ -544,14 +546,16 @@ static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct) static int ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, - struct nf_conn *ct, bool extinfo) + struct nf_conn *ct, bool extinfo, unsigned int flags) { const struct nf_conntrack_zone *zone; struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; struct nlattr *nest_parms; - unsigned int flags = portid ? NLM_F_MULTI : 0, event; + unsigned int event; + if (portid) + flags |= NLM_F_MULTI; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_NEW); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) @@ -847,17 +851,70 @@ static int ctnetlink_done(struct netlink_callback *cb) } struct ctnetlink_filter { + u_int32_t cta_flags; u8 family; + + u_int32_t orig_flags; + u_int32_t reply_flags; + + struct nf_conntrack_tuple orig; + struct nf_conntrack_tuple reply; + struct nf_conntrack_zone zone; + struct { u_int32_t val; u_int32_t mask; } mark; }; +static const struct nla_policy cta_filter_nla_policy[CTA_FILTER_MAX + 1] = { + [CTA_FILTER_ORIG_FLAGS] = { .type = NLA_U32 }, + [CTA_FILTER_REPLY_FLAGS] = { .type = NLA_U32 }, +}; + +static int ctnetlink_parse_filter(const struct nlattr *attr, + struct ctnetlink_filter *filter) +{ + struct nlattr *tb[CTA_FILTER_MAX + 1]; + int ret = 0; + + ret = nla_parse_nested(tb, CTA_FILTER_MAX, attr, cta_filter_nla_policy, + NULL); + if (ret) + return ret; + + if (tb[CTA_FILTER_ORIG_FLAGS]) { + filter->orig_flags = nla_get_u32(tb[CTA_FILTER_ORIG_FLAGS]); + if (filter->orig_flags & ~CTA_FILTER_F_ALL) + return -EOPNOTSUPP; + } + + if (tb[CTA_FILTER_REPLY_FLAGS]) { + filter->reply_flags = nla_get_u32(tb[CTA_FILTER_REPLY_FLAGS]); + if (filter->reply_flags & ~CTA_FILTER_F_ALL) + return -EOPNOTSUPP; + } + + return 0; +} + +static int ctnetlink_parse_zone(const struct nlattr *attr, + struct nf_conntrack_zone *zone); +static int ctnetlink_parse_tuple_filter(const struct nlattr * const cda[], + struct nf_conntrack_tuple *tuple, + u32 type, u_int8_t l3num, + struct nf_conntrack_zone *zone, + u_int32_t flags); + +/* applied on filters */ +#define CTA_FILTER_F_CTA_MARK (1 << 0) +#define CTA_FILTER_F_CTA_MARK_MASK (1 << 1) + static struct ctnetlink_filter * ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) { struct ctnetlink_filter *filter; + int err; #ifndef CONFIG_NF_CONNTRACK_MARK if (cda[CTA_MARK] || cda[CTA_MARK_MASK]) @@ -871,14 +928,65 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) filter->family = family; #ifdef CONFIG_NF_CONNTRACK_MARK - if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) { + if (cda[CTA_MARK]) { filter->mark.val = ntohl(nla_get_be32(cda[CTA_MARK])); - filter->mark.mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK])); + filter->cta_flags |= CTA_FILTER_FLAG(CTA_MARK); + + if (cda[CTA_MARK_MASK]) { + filter->mark.mask = 
ntohl(nla_get_be32(cda[CTA_MARK_MASK])); + filter->cta_flags |= CTA_FILTER_FLAG(CTA_MARK_MASK); + } else { + filter->mark.mask = 0xffffffff; + } + } else if (cda[CTA_MARK_MASK]) { + return ERR_PTR(-EINVAL); } #endif + if (!cda[CTA_FILTER]) + return filter; + + err = ctnetlink_parse_zone(cda[CTA_ZONE], &filter->zone); + if (err < 0) + return ERR_PTR(err); + + err = ctnetlink_parse_filter(cda[CTA_FILTER], filter); + if (err < 0) + return ERR_PTR(err); + + if (filter->orig_flags) { + if (!cda[CTA_TUPLE_ORIG]) + return ERR_PTR(-EINVAL); + + err = ctnetlink_parse_tuple_filter(cda, &filter->orig, + CTA_TUPLE_ORIG, + filter->family, + &filter->zone, + filter->orig_flags); + if (err < 0) + return ERR_PTR(err); + } + + if (filter->reply_flags) { + if (!cda[CTA_TUPLE_REPLY]) + return ERR_PTR(-EINVAL); + + err = ctnetlink_parse_tuple_filter(cda, &filter->reply, + CTA_TUPLE_REPLY, + filter->family, + &filter->zone, + filter->reply_flags); + if (err < 0) + return ERR_PTR(err); + } + return filter; } +static bool ctnetlink_needs_filter(u8 family, const struct nlattr * const *cda) +{ + return family || cda[CTA_MARK] || cda[CTA_FILTER]; +} + static int ctnetlink_start(struct netlink_callback *cb) { const struct nlattr * const *cda = cb->data; @@ -886,7 +994,7 @@ static int ctnetlink_start(struct netlink_callback *cb) struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); u8 family = nfmsg->nfgen_family; - if (family || (cda[CTA_MARK] && cda[CTA_MARK_MASK])) { + if (ctnetlink_needs_filter(family, cda)) { filter = ctnetlink_alloc_filter(cda, family); if (IS_ERR(filter)) return PTR_ERR(filter); @@ -896,9 +1004,79 @@ static int ctnetlink_start(struct netlink_callback *cb) return 0; } +static int ctnetlink_filter_match_tuple(struct nf_conntrack_tuple *filter_tuple, + struct nf_conntrack_tuple *ct_tuple, + u_int32_t flags, int family) +{ + switch (family) { + case NFPROTO_IPV4: + if ((flags & CTA_FILTER_FLAG(CTA_IP_SRC)) && + filter_tuple->src.u3.ip != ct_tuple->src.u3.ip) + return 0; + + if ((flags & CTA_FILTER_FLAG(CTA_IP_DST)) && + filter_tuple->dst.u3.ip != ct_tuple->dst.u3.ip) + return 0; + break; + case NFPROTO_IPV6: + if ((flags & CTA_FILTER_FLAG(CTA_IP_SRC)) && + ipv6_addr_cmp(&filter_tuple->src.u3.in6, + &ct_tuple->src.u3.in6)) + return 0; + + if ((flags & CTA_FILTER_FLAG(CTA_IP_DST)) && + ipv6_addr_cmp(&filter_tuple->dst.u3.in6, + &ct_tuple->dst.u3.in6)) + return 0; + break; + } + + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_NUM)) && + filter_tuple->dst.protonum != ct_tuple->dst.protonum) + return 0; + + switch (ct_tuple->dst.protonum) { + case IPPROTO_TCP: + case IPPROTO_UDP: + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) && + filter_tuple->src.u.tcp.port != ct_tuple->src.u.tcp.port) + return 0; + + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) && + filter_tuple->dst.u.tcp.port != ct_tuple->dst.u.tcp.port) + return 0; + break; + case IPPROTO_ICMP: + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_TYPE)) && + filter_tuple->dst.u.icmp.type != ct_tuple->dst.u.icmp.type) + return 0; + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_CODE)) && + filter_tuple->dst.u.icmp.code != ct_tuple->dst.u.icmp.code) + return 0; + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_ID)) && + filter_tuple->src.u.icmp.id != ct_tuple->src.u.icmp.id) + return 0; + break; + case IPPROTO_ICMPV6: + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_TYPE)) && + filter_tuple->dst.u.icmp.type != ct_tuple->dst.u.icmp.type) + return 0; + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_CODE)) && + filter_tuple->dst.u.icmp.code != ct_tuple->dst.u.icmp.code)
+ return 0; + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_ID)) && + filter_tuple->src.u.icmp.id != ct_tuple->src.u.icmp.id) + return 0; + break; + } + + return 1; +} + static int ctnetlink_filter_match(struct nf_conn *ct, void *data) { struct ctnetlink_filter *filter = data; + struct nf_conntrack_tuple *tuple; if (filter == NULL) goto out; @@ -910,8 +1088,28 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) if (filter->family && nf_ct_l3num(ct) != filter->family) goto ignore_entry; + if (filter->orig_flags) { + tuple = nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL); + if (!ctnetlink_filter_match_tuple(&filter->orig, tuple, + filter->orig_flags, + filter->family)) + goto ignore_entry; + } + + if (filter->reply_flags) { + tuple = nf_ct_tuple(ct, IP_CT_DIR_REPLY); + if (!ctnetlink_filter_match_tuple(&filter->reply, tuple, + filter->reply_flags, + filter->family)) + goto ignore_entry; + } + #ifdef CONFIG_NF_CONNTRACK_MARK - if ((ct->mark & filter->mark.mask) != filter->mark.val) + if ((filter->cta_flags & CTA_FILTER_FLAG(CTA_MARK_MASK)) && + (ct->mark & filter->mark.mask) != filter->mark.val) + goto ignore_entry; + else if ((filter->cta_flags & CTA_FILTER_FLAG(CTA_MARK)) && + ct->mark != filter->mark.val) goto ignore_entry; #endif @@ -925,6 +1123,7 @@ ignore_entry: static int ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { + unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0; struct net *net = sock_net(skb->sk); struct nf_conn *ct, *last; struct nf_conntrack_tuple_hash *h; @@ -979,7 +1178,7 @@ restart: ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), - ct, true); + ct, true, flags); if (res < 0) { nf_conntrack_get(&ct->ct_general); cb->args[1] = (unsigned long)ct; @@ -1014,31 +1213,50 @@ out: } static int ipv4_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *t) + struct nf_conntrack_tuple *t, + u_int32_t flags) { - if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST]) - return -EINVAL; + if (flags & CTA_FILTER_FLAG(CTA_IP_SRC)) { + if (!tb[CTA_IP_V4_SRC]) + return -EINVAL; + + t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]); + } + + if (flags & CTA_FILTER_FLAG(CTA_IP_DST)) { + if (!tb[CTA_IP_V4_DST]) + return -EINVAL; - t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]); - t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]); + t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]); + } return 0; } static int ipv6_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *t) + struct nf_conntrack_tuple *t, + u_int32_t flags) { - if (!tb[CTA_IP_V6_SRC] || !tb[CTA_IP_V6_DST]) - return -EINVAL; + if (flags & CTA_FILTER_FLAG(CTA_IP_SRC)) { + if (!tb[CTA_IP_V6_SRC]) + return -EINVAL; - t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]); - t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]); + t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]); + } + + if (flags & CTA_FILTER_FLAG(CTA_IP_DST)) { + if (!tb[CTA_IP_V6_DST]) + return -EINVAL; + + t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]); + } return 0; } static int ctnetlink_parse_tuple_ip(struct nlattr *attr, - struct nf_conntrack_tuple *tuple) + struct nf_conntrack_tuple *tuple, + u_int32_t flags) { struct nlattr *tb[CTA_IP_MAX+1]; int ret = 0; @@ -1054,10 +1272,10 @@ static int ctnetlink_parse_tuple_ip(struct nlattr *attr, switch (tuple->src.l3num) { case NFPROTO_IPV4: - ret = ipv4_nlattr_to_tuple(tb, tuple); + ret = ipv4_nlattr_to_tuple(tb, tuple, flags); break; case NFPROTO_IPV6: - ret = ipv6_nlattr_to_tuple(tb, tuple); + ret = 
ipv6_nlattr_to_tuple(tb, tuple, flags); break; } @@ -1069,7 +1287,8 @@ static const struct nla_policy proto_nla_policy[CTA_PROTO_MAX+1] = { }; static int ctnetlink_parse_tuple_proto(struct nlattr *attr, - struct nf_conntrack_tuple *tuple) + struct nf_conntrack_tuple *tuple, + u_int32_t flags) { const struct nf_conntrack_l4proto *l4proto; struct nlattr *tb[CTA_PROTO_MAX+1]; @@ -1080,8 +1299,12 @@ static int ctnetlink_parse_tuple_proto(struct nlattr *attr, if (ret < 0) return ret; + if (!(flags & CTA_FILTER_FLAG(CTA_PROTO_NUM))) + return 0; + if (!tb[CTA_PROTO_NUM]) return -EINVAL; + tuple->dst.protonum = nla_get_u8(tb[CTA_PROTO_NUM]); rcu_read_lock(); @@ -1092,7 +1315,7 @@ static int ctnetlink_parse_tuple_proto(struct nlattr *attr, l4proto->nla_policy, NULL); if (ret == 0) - ret = l4proto->nlattr_to_tuple(tb, tuple); + ret = l4proto->nlattr_to_tuple(tb, tuple, flags); } rcu_read_unlock(); @@ -1143,10 +1366,21 @@ static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = { [CTA_TUPLE_ZONE] = { .type = NLA_U16 }, }; +#define CTA_FILTER_F_ALL_CTA_PROTO \ + (CTA_FILTER_F_CTA_PROTO_SRC_PORT | \ + CTA_FILTER_F_CTA_PROTO_DST_PORT | \ + CTA_FILTER_F_CTA_PROTO_ICMP_TYPE | \ + CTA_FILTER_F_CTA_PROTO_ICMP_CODE | \ + CTA_FILTER_F_CTA_PROTO_ICMP_ID | \ + CTA_FILTER_F_CTA_PROTO_ICMPV6_TYPE | \ + CTA_FILTER_F_CTA_PROTO_ICMPV6_CODE | \ + CTA_FILTER_F_CTA_PROTO_ICMPV6_ID) + static int -ctnetlink_parse_tuple(const struct nlattr * const cda[], - struct nf_conntrack_tuple *tuple, u32 type, - u_int8_t l3num, struct nf_conntrack_zone *zone) +ctnetlink_parse_tuple_filter(const struct nlattr * const cda[], + struct nf_conntrack_tuple *tuple, u32 type, + u_int8_t l3num, struct nf_conntrack_zone *zone, + u_int32_t flags) { struct nlattr *tb[CTA_TUPLE_MAX+1]; int err; @@ -1158,23 +1392,32 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[], if (err < 0) return err; - if (!tb[CTA_TUPLE_IP]) - return -EINVAL; tuple->src.l3num = l3num; - err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP], tuple); - if (err < 0) - return err; + if (flags & CTA_FILTER_FLAG(CTA_IP_DST) || + flags & CTA_FILTER_FLAG(CTA_IP_SRC)) { + if (!tb[CTA_TUPLE_IP]) + return -EINVAL; - if (!tb[CTA_TUPLE_PROTO]) - return -EINVAL; + err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP], tuple, flags); + if (err < 0) + return err; + } - err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO], tuple); - if (err < 0) - return err; + if (flags & CTA_FILTER_FLAG(CTA_PROTO_NUM)) { + if (!tb[CTA_TUPLE_PROTO]) + return -EINVAL; - if (tb[CTA_TUPLE_ZONE]) { + err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO], tuple, flags); + if (err < 0) + return err; + } else if (flags & CTA_FILTER_FLAG(ALL_CTA_PROTO)) { + /* Can't manage proto flags without a protonum */ + return -EINVAL; + } + + if ((flags & CTA_FILTER_FLAG(CTA_TUPLE_ZONE)) && tb[CTA_TUPLE_ZONE]) { if (!zone) return -EINVAL; @@ -1193,6 +1436,15 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[], return 0; } +static int +ctnetlink_parse_tuple(const struct nlattr * const cda[], + struct nf_conntrack_tuple *tuple, u32 type, + u_int8_t l3num, struct nf_conntrack_zone *zone) +{ + return ctnetlink_parse_tuple_filter(cda, tuple, type, l3num, zone, + CTA_FILTER_FLAG(ALL)); +} + static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = { [CTA_HELP_NAME] = { .type = NLA_NUL_STRING, .len = NF_CT_HELPER_NAME_LEN - 1 }, @@ -1240,6 +1492,7 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { .len = NF_CT_LABELS_MAX_SIZE }, [CTA_LABELS_MASK] = { .type = NLA_BINARY, .len = 
NF_CT_LABELS_MAX_SIZE }, + [CTA_FILTER] = { .type = NLA_NESTED }, }; static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data) @@ -1256,7 +1509,10 @@ static int ctnetlink_flush_conntrack(struct net *net, { struct ctnetlink_filter *filter = NULL; - if (family || (cda[CTA_MARK] && cda[CTA_MARK_MASK])) { + if (ctnetlink_needs_filter(family, cda)) { + if (cda[CTA_FILTER]) + return -EOPNOTSUPP; + filter = ctnetlink_alloc_filter(cda, family); if (IS_ERR(filter)) return PTR_ERR(filter); @@ -1385,7 +1641,7 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl, } err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), ct, true); + NFNL_MSG_TYPE(nlh->nlmsg_type), ct, true, 0); nf_ct_put(ct); if (err <= 0) goto free; @@ -1458,7 +1714,7 @@ restart: res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), - ct, dying ? true : false); + ct, dying ? true : false, 0); if (res < 0) { if (!atomic_inc_not_zero(&ct->ct_general.use)) continue; diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index c2e3dff773bc..4efd8741c105 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -20,6 +20,8 @@ #include #include +#include "nf_internals.h" + static const unsigned int nf_ct_icmp_timeout = 30*HZ; bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, @@ -271,20 +273,32 @@ static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = { }; static int icmp_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *tuple) + struct nf_conntrack_tuple *tuple, + u_int32_t flags) { - if (!tb[CTA_PROTO_ICMP_TYPE] || - !tb[CTA_PROTO_ICMP_CODE] || - !tb[CTA_PROTO_ICMP_ID]) - return -EINVAL; - - tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]); - tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]); - tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]); - - if (tuple->dst.u.icmp.type >= sizeof(invmap) || - !invmap[tuple->dst.u.icmp.type]) - return -EINVAL; + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_TYPE)) { + if (!tb[CTA_PROTO_ICMP_TYPE]) + return -EINVAL; + + tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]); + if (tuple->dst.u.icmp.type >= sizeof(invmap) || + !invmap[tuple->dst.u.icmp.type]) + return -EINVAL; + } + + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_CODE)) { + if (!tb[CTA_PROTO_ICMP_CODE]) + return -EINVAL; + + tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]); + } + + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_ID)) { + if (!tb[CTA_PROTO_ICMP_ID]) + return -EINVAL; + + tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]); + } return 0; } diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index 6f9144e1f1c1..facd8c64ec4e 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -24,6 +24,8 @@ #include #include +#include "nf_internals.h" + static const unsigned int nf_ct_icmpv6_timeout = 30*HZ; bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, @@ -193,21 +195,33 @@ static const struct nla_policy icmpv6_nla_policy[CTA_PROTO_MAX+1] = { }; static int icmpv6_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *tuple) + struct nf_conntrack_tuple *tuple, + u_int32_t flags) { - if (!tb[CTA_PROTO_ICMPV6_TYPE] || - !tb[CTA_PROTO_ICMPV6_CODE] || - !tb[CTA_PROTO_ICMPV6_ID]) - return -EINVAL; - - 
tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]); - tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]); - tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]); - - if (tuple->dst.u.icmp.type < 128 || - tuple->dst.u.icmp.type - 128 >= sizeof(invmap) || - !invmap[tuple->dst.u.icmp.type - 128]) - return -EINVAL; + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_TYPE)) { + if (!tb[CTA_PROTO_ICMPV6_TYPE]) + return -EINVAL; + + tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]); + if (tuple->dst.u.icmp.type < 128 || + tuple->dst.u.icmp.type - 128 >= sizeof(invmap) || + !invmap[tuple->dst.u.icmp.type - 128]) + return -EINVAL; + } + + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_CODE)) { + if (!tb[CTA_PROTO_ICMPV6_CODE]) + return -EINVAL; + + tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]); + } + + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_ID)) { + if (!tb[CTA_PROTO_ICMPV6_ID]) + return -EINVAL; + + tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]); + } return 0; }
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index d6c43902ebd7..832ae64179f0 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -6,6 +6,23 @@ #include #include +/* nf_conntrack_netlink.c: applied on tuple filters */ +#define CTA_FILTER_F_CTA_IP_SRC (1 << 0) +#define CTA_FILTER_F_CTA_IP_DST (1 << 1) +#define CTA_FILTER_F_CTA_TUPLE_ZONE (1 << 2) +#define CTA_FILTER_F_CTA_PROTO_NUM (1 << 3) +#define CTA_FILTER_F_CTA_PROTO_SRC_PORT (1 << 4) +#define CTA_FILTER_F_CTA_PROTO_DST_PORT (1 << 5) +#define CTA_FILTER_F_CTA_PROTO_ICMP_TYPE (1 << 6) +#define CTA_FILTER_F_CTA_PROTO_ICMP_CODE (1 << 7) +#define CTA_FILTER_F_CTA_PROTO_ICMP_ID (1 << 8) +#define CTA_FILTER_F_CTA_PROTO_ICMPV6_TYPE (1 << 9) +#define CTA_FILTER_F_CTA_PROTO_ICMPV6_CODE (1 << 10) +#define CTA_FILTER_F_CTA_PROTO_ICMPV6_ID (1 << 11) +#define CTA_FILTER_F_MAX (1 << 12) +#define CTA_FILTER_F_ALL (CTA_FILTER_F_MAX-1) +#define CTA_FILTER_FLAG(ctattr) CTA_FILTER_F_ ## ctattr + /* nf_queue.c */ void nf_queue_nf_hook_drop(struct net *net); -- cgit v1.2.3
From d9246a53752fdb777ed176d5f091c4ac0e482bba Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 20 May 2020 13:42:44 +0200 Subject: netfilter: nf_tables: generalise flowtable hook parsing Update nft_flowtable_parse_hook() to take the flowtable hook list as a parameter. This allows the function to be reused to update the hooks.
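In sketch form, the refactor moves the parse result out of struct nft_flowtable into a small caller-owned structure, so the same parser can serve both flowtable creation and, later, hook updates. A simplified plain-C model (hypothetical names; the kernel version uses struct nft_flowtable_hook and list_head, as the diff below shows):

    #include <stddef.h>

    struct hook {
            struct hook *next;       /* hand-rolled stand-in for list_head */
            const char *dev;
    };

    struct hook_cfg {                /* models struct nft_flowtable_hook */
            int num;
            int priority;
            struct hook *hooks;
    };

    /* The parser fills 'cfg' and never touches the flowtable itself. */
    static int parse_hook(struct hook_cfg *cfg)
    {
            cfg->num = 0;            /* would come from the HOOK_NUM attribute */
            cfg->priority = 0;       /* would come from HOOK_PRIORITY */
            cfg->hooks = NULL;       /* would come from HOOK_DEVS */
            return 0;
    }

    struct flowtable {
            int hooknum;
            int priority;
            struct hook *hooks;
    };

    /* Creation adopts the parsed result wholesale; an update path can
     * instead merge cfg->hooks into an existing flowtable. */
    static void flowtable_adopt(struct flowtable *ft, struct hook_cfg *cfg)
    {
            ft->hooknum = cfg->num;
            ft->priority = cfg->priority;
            ft->hooks = cfg->hooks;
            cfg->hooks = NULL;
    }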
Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3558e76e2733..87945b4a6789 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6178,21 +6178,30 @@ nft_flowtable_lookup_byhandle(const struct nft_table *table, return ERR_PTR(-ENOENT); } +struct nft_flowtable_hook { + u32 num; + int priority; + struct list_head list; +}; + static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX + 1] = { [NFTA_FLOWTABLE_HOOK_NUM] = { .type = NLA_U32 }, [NFTA_FLOWTABLE_HOOK_PRIORITY] = { .type = NLA_U32 }, [NFTA_FLOWTABLE_HOOK_DEVS] = { .type = NLA_NESTED }, }; -static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, - const struct nlattr *attr, - struct nft_flowtable *flowtable) +static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, + const struct nlattr *attr, + struct nft_flowtable_hook *flowtable_hook, + struct nf_flowtable *ft) { struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1]; struct nft_hook *hook; int hooknum, priority; int err; + INIT_LIST_HEAD(&flowtable_hook->list); + err = nla_parse_nested_deprecated(tb, NFTA_FLOWTABLE_HOOK_MAX, attr, nft_flowtable_hook_policy, NULL); if (err < 0) @@ -6211,19 +6220,19 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, err = nf_tables_parse_netdev_hooks(ctx->net, tb[NFTA_FLOWTABLE_HOOK_DEVS], - &flowtable->hook_list); + &flowtable_hook->list); if (err < 0) return err; - flowtable->hooknum = hooknum; - flowtable->data.priority = priority; + flowtable_hook->priority = priority; + flowtable_hook->num = hooknum; - list_for_each_entry(hook, &flowtable->hook_list, list) { + list_for_each_entry(hook, &flowtable_hook->list, list) { hook->ops.pf = NFPROTO_NETDEV; hook->ops.hooknum = hooknum; hook->ops.priority = priority; - hook->ops.priv = &flowtable->data; - hook->ops.hook = flowtable->data.type->hook; + hook->ops.priv = ft; + hook->ops.hook = ft->type->hook; } return err; @@ -6336,6 +6345,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, struct netlink_ext_ack *extack) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nft_flowtable_hook flowtable_hook; const struct nf_flowtable_type *type; u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; @@ -6409,11 +6419,15 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, if (err < 0) goto err3; - err = nf_tables_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK], - flowtable); + err = nft_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK], + &flowtable_hook, &flowtable->data); if (err < 0) goto err4; + list_splice(&flowtable_hook.list, &flowtable->hook_list); + flowtable->data.priority = flowtable_hook.priority; + flowtable->hooknum = flowtable_hook.num; + err = nft_register_flowtable_net_hooks(ctx.net, table, flowtable); if (err < 0) { list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { -- cgit v1.2.3 From f9382669cf5e75ebc7636bd78e637facf27d53f7 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 19 May 2020 01:00:07 +0200 Subject: netfilter: nf_tables: pass hook list to nft_{un,}register_flowtable_net_hooks() This patch prepares for incremental flowtable hook updates. 
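One detail worth noting in the diff below: nft_register_flowtable_net_hooks() counts successful registrations so that a mid-list failure unwinds only the hooks that actually went in (the i-- logic in the error path). That pattern, reduced to a self-contained sketch where register_one()/unregister_one() are hypothetical stubs:

    static int register_one(int hook)    { (void)hook; return 0; }
    static void unregister_one(int hook) { (void)hook; }

    static int register_hooks(const int *hooks, int n)
    {
            int i, err;

            for (i = 0; i < n; i++) {
                    err = register_one(hooks[i]);
                    if (err)
                            goto unwind;
            }
            return 0;

    unwind:
            while (--i >= 0)        /* undo only what succeeded */
                    unregister_one(hooks[i]);
            return err;
    }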
Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'net')
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 87945b4a6789..1505552aaa74 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6279,23 +6279,24 @@ static void nft_unregister_flowtable_hook(struct net *net, } static void nft_unregister_flowtable_net_hooks(struct net *net, - struct nft_flowtable *flowtable) + struct list_head *hook_list) { struct nft_hook *hook; - list_for_each_entry(hook, &flowtable->hook_list, list) + list_for_each_entry(hook, hook_list, list) nf_unregister_net_hook(net, &hook->ops); } static int nft_register_flowtable_net_hooks(struct net *net, struct nft_table *table, + struct list_head *hook_list, struct nft_flowtable *flowtable) { struct nft_hook *hook, *hook2, *next; struct nft_flowtable *ft; int err, i = 0; - list_for_each_entry(hook, &flowtable->hook_list, list) { + list_for_each_entry(hook, hook_list, list) { list_for_each_entry(ft, &table->flowtables, list) { list_for_each_entry(hook2, &ft->hook_list, list) { if (hook->ops.dev == hook2->ops.dev && @@ -6326,7 +6327,7 @@ static int nft_register_flowtable_net_hooks(struct net *net, return 0; err_unregister_net_hooks: - list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { + list_for_each_entry_safe(hook, next, hook_list, list) { if (i-- <= 0) break; @@ -6428,7 +6429,9 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, flowtable->data.priority = flowtable_hook.priority; flowtable->hooknum = flowtable_hook.num; - err = nft_register_flowtable_net_hooks(ctx.net, table, flowtable); + err = nft_register_flowtable_net_hooks(ctx.net, table, + &flowtable->hook_list, + flowtable); if (err < 0) { list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { list_del_rcu(&hook->list); @@ -7493,7 +7496,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_flowtable(trans), NFT_MSG_DELFLOWTABLE); nft_unregister_flowtable_net_hooks(net, - nft_trans_flowtable(trans)); + &nft_trans_flowtable(trans)->hook_list); break; } } @@ -7652,7 +7655,7 @@ static int __nf_tables_abort(struct net *net, bool autoload) trans->ctx.table->use--; list_del_rcu(&nft_trans_flowtable(trans)->list); nft_unregister_flowtable_net_hooks(net, - nft_trans_flowtable(trans)); + &nft_trans_flowtable(trans)->hook_list); break; case NFT_MSG_DELFLOWTABLE: trans->ctx.table->use++; -- cgit v1.2.3
From 389a2cbcb7f15e2af9babdc0c63cec318537e7ed Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 20 May 2020 13:43:43 +0200 Subject: netfilter: nf_tables: add nft_flowtable_hooks_destroy() This patch adds a helper function to destroy the flowtable hooks.
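Destroying hooks while walking the list requires reading the next pointer before freeing the current node, which is exactly what list_for_each_entry_safe() does in the helper. The same idea in plain C with a hand-rolled singly linked list (a sketch, not the kernel code):

    #include <stdlib.h>

    struct hook {
            struct hook *next;
    };

    static void destroy_hooks(struct hook **head)
    {
            struct hook *h = *head, *next;

            while (h) {
                    next = h->next;  /* grab before freeing the node */
                    free(h);
                    h = next;
            }
            *head = NULL;
    }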
Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'net')
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1505552aaa74..f5d100787ccb 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6339,6 +6339,16 @@ err_unregister_net_hooks: return err; } +static void nft_flowtable_hooks_destroy(struct list_head *hook_list) +{ + struct nft_hook *hook, *next; + + list_for_each_entry_safe(hook, next, hook_list, list) { + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); + } +} + static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, @@ -6433,10 +6443,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, &flowtable->hook_list, flowtable); if (err < 0) { - list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { - list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); - } + nft_flowtable_hooks_destroy(&flowtable->hook_list); goto err4; } -- cgit v1.2.3
From c42d8bda69e291c5551497a02db71b50d95510d4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 20 May 2020 13:44:18 +0200 Subject: netfilter: nf_tables: pass hook list to flowtable event notifier Update the flowtable netlink notifier to take the list of hooks as input. This allows the function to be reused for incremental flowtable hook updates. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'net')
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f5d100787ccb..4db70e68d7f4 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6523,7 +6523,8 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, - struct nft_flowtable *flowtable) + struct nft_flowtable *flowtable, + struct list_head *hook_list) { struct nlattr *nest, *nest_devs; struct nfgenmsg *nfmsg; @@ -6559,7 +6560,7 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, if (!nest_devs) goto nla_put_failure; - list_for_each_entry_rcu(hook, &flowtable->hook_list, list) { + list_for_each_entry_rcu(hook, hook_list, list) { if (nla_put_string(skb, NFTA_DEVICE_NAME, hook->ops.dev->name)) goto nla_put_failure; } @@ -6612,7 +6613,9 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb, cb->nlh->nlmsg_seq, NFT_MSG_NEWFLOWTABLE, NLM_F_MULTI | NLM_F_APPEND, - table->family, flowtable) < 0) + table->family, + flowtable, + &flowtable->hook_list) < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -6709,7 +6712,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, err = nf_tables_fill_flowtable_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFT_MSG_NEWFLOWTABLE, 0, family, - flowtable); + flowtable, &flowtable->hook_list); if (err < 0) goto err; @@ -6721,6 +6724,7 @@ err: static void nf_tables_flowtable_notify(struct nft_ctx *ctx, struct nft_flowtable *flowtable, + struct list_head *hook_list, int event) { struct sk_buff *skb; @@ -6736,7 +6740,7 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx, err = nf_tables_fill_flowtable_info(skb, ctx->net, ctx->portid, ctx->seq, event, 0, - ctx->family, flowtable); + ctx->family, flowtable, hook_list); if (err < 0) {
kfree_skb(skb); goto err; @@ -7494,6 +7498,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_clear(net, nft_trans_flowtable(trans)); nf_tables_flowtable_notify(&trans->ctx, nft_trans_flowtable(trans), + &nft_trans_flowtable(trans)->hook_list, NFT_MSG_NEWFLOWTABLE); nft_trans_destroy(trans); break; @@ -7501,6 +7506,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) list_del_rcu(&nft_trans_flowtable(trans)->list); nf_tables_flowtable_notify(&trans->ctx, nft_trans_flowtable(trans), + &nft_trans_flowtable(trans)->hook_list, NFT_MSG_DELFLOWTABLE); nft_unregister_flowtable_net_hooks(net, &nft_trans_flowtable(trans)->hook_list); -- cgit v1.2.3 From 78d9f48f7f44431a25da2b46b3a8812f6ff2b981 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 20 May 2020 13:46:47 +0200 Subject: netfilter: nf_tables: add devices to existing flowtable This patch allows users to add devices to an existing flowtable. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 6 +++ net/netfilter/nf_tables_api.c | 97 ++++++++++++++++++++++++++++++++++----- 2 files changed, 92 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index d4e29c952c40..4f58c4411bb4 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1481,10 +1481,16 @@ struct nft_trans_obj { struct nft_trans_flowtable { struct nft_flowtable *flowtable; + bool update; + struct list_head hook_list; }; #define nft_trans_flowtable(trans) \ (((struct nft_trans_flowtable *)trans->data)->flowtable) +#define nft_trans_flowtable_update(trans) \ + (((struct nft_trans_flowtable *)trans->data)->update) +#define nft_trans_flowtable_hooks(trans) \ + (((struct nft_trans_flowtable *)trans->data)->hook_list) int __init nft_chain_filter_init(void); void nft_chain_filter_fini(void); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 4db70e68d7f4..98f2cbb97e39 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6349,6 +6349,62 @@ static void nft_flowtable_hooks_destroy(struct list_head *hook_list) } } +static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, + struct nft_flowtable *flowtable) +{ + const struct nlattr * const *nla = ctx->nla; + struct nft_flowtable_hook flowtable_hook; + struct nft_hook *hook, *next; + struct nft_trans *trans; + bool unregister = false; + int err; + + err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK], + &flowtable_hook, &flowtable->data); + if (err < 0) + return err; + + list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) { + if (nft_hook_list_find(&flowtable->hook_list, hook)) { + list_del(&hook->list); + kfree(hook); + } + } + + err = nft_register_flowtable_net_hooks(ctx->net, ctx->table, + &flowtable_hook.list, flowtable); + if (err < 0) + goto err_flowtable_update_hook; + + trans = nft_trans_alloc(ctx, NFT_MSG_NEWFLOWTABLE, + sizeof(struct nft_trans_flowtable)); + if (!trans) { + unregister = true; + err = -ENOMEM; + goto err_flowtable_update_hook; + } + + nft_trans_flowtable(trans) = flowtable; + nft_trans_flowtable_update(trans) = true; + INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans)); + list_splice(&flowtable_hook.list, &nft_trans_flowtable_hooks(trans)); + + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + + return 0; + +err_flowtable_update_hook: + list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) { + if (unregister) + 
nft_unregister_flowtable_hook(ctx->net, flowtable, hook); + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); + } + + return err; + +} + static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, @@ -6392,7 +6448,9 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, return -EEXIST; } - return 0; + nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + + return nft_flowtable_update(&ctx, nlh, flowtable); } nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); @@ -7495,11 +7553,20 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) NFT_MSG_DELOBJ); break; case NFT_MSG_NEWFLOWTABLE: - nft_clear(net, nft_trans_flowtable(trans)); - nf_tables_flowtable_notify(&trans->ctx, - nft_trans_flowtable(trans), - &nft_trans_flowtable(trans)->hook_list, - NFT_MSG_NEWFLOWTABLE); + if (nft_trans_flowtable_update(trans)) { + nf_tables_flowtable_notify(&trans->ctx, + nft_trans_flowtable(trans), + &nft_trans_flowtable_hooks(trans), + NFT_MSG_NEWFLOWTABLE); + list_splice(&nft_trans_flowtable_hooks(trans), + &nft_trans_flowtable(trans)->hook_list); + } else { + nft_clear(net, nft_trans_flowtable(trans)); + nf_tables_flowtable_notify(&trans->ctx, + nft_trans_flowtable(trans), + &nft_trans_flowtable(trans)->hook_list, + NFT_MSG_NEWFLOWTABLE); + } nft_trans_destroy(trans); break; case NFT_MSG_DELFLOWTABLE: @@ -7558,7 +7625,10 @@ static void nf_tables_abort_release(struct nft_trans *trans) nft_obj_destroy(&trans->ctx, nft_trans_obj(trans)); break; case NFT_MSG_NEWFLOWTABLE: - nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); + if (nft_trans_flowtable_update(trans)) + nft_flowtable_hooks_destroy(&nft_trans_flowtable_hooks(trans)); + else + nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); break; } kfree(trans); @@ -7665,10 +7735,15 @@ static int __nf_tables_abort(struct net *net, bool autoload) nft_trans_destroy(trans); break; case NFT_MSG_NEWFLOWTABLE: - trans->ctx.table->use--; - list_del_rcu(&nft_trans_flowtable(trans)->list); - nft_unregister_flowtable_net_hooks(net, - &nft_trans_flowtable(trans)->hook_list); + if (nft_trans_flowtable_update(trans)) { + nft_unregister_flowtable_net_hooks(net, + &nft_trans_flowtable_hooks(trans)); + } else { + trans->ctx.table->use--; + list_del_rcu(&nft_trans_flowtable(trans)->list); + nft_unregister_flowtable_net_hooks(net, + &nft_trans_flowtable(trans)->hook_list); + } break; case NFT_MSG_DELFLOWTABLE: trans->ctx.table->use++; -- cgit v1.2.3 From abadb2f865d72a223d691fc68e006943ecadf0d9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 20 May 2020 13:46:51 +0200 Subject: netfilter: nf_tables: delete devices from flowtable This patch allows users to delete devices from existing flowtables. 
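Together with the previous patch, device add/delete becomes transactional, driven by the per-hook 'inactive' flag: the prepare step only marks hooks, commit actually unregisters the marked ones, and abort clears the marks again. A compact model with hypothetical names and a stub teardown:

    #include <stdbool.h>

    struct hook {
            bool inactive;
            /* device, ops and list linkage elided */
    };

    static void unregister_hook(struct hook *h) { (void)h; /* teardown */ }

    static void prepare_delete(struct hook *h)
    {
            h->inactive = true;          /* nothing is torn down yet */
    }

    static void abort_delete(struct hook *h)
    {
            h->inactive = false;         /* roll the transaction back */
    }

    static void commit_delete(struct hook *h)
    {
            if (h->inactive)
                    unregister_hook(h);  /* only now does teardown happen */
    }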
Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 1 + net/netfilter/nf_tables_api.c | 113 ++++++++++++++++++++++++++++++++------ 2 files changed, 98 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 4f58c4411bb4..6f0f6fca9ac3 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1002,6 +1002,7 @@ struct nft_stats { struct nft_hook { struct list_head list; + bool inactive; struct nf_hook_ops ops; struct rcu_head rcu; }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 98f2cbb97e39..1c2c3bb78fa0 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1669,6 +1669,7 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net, goto err_hook_dev; } hook->ops.dev = dev; + hook->inactive = false; return hook; @@ -1678,17 +1679,17 @@ err_hook_alloc: return ERR_PTR(err); } -static bool nft_hook_list_find(struct list_head *hook_list, - const struct nft_hook *this) +static struct nft_hook *nft_hook_list_find(struct list_head *hook_list, + const struct nft_hook *this) { struct nft_hook *hook; list_for_each_entry(hook, hook_list, list) { if (this->ops.dev == hook->ops.dev) - return true; + return hook; } - return false; + return NULL; } static int nf_tables_parse_netdev_hooks(struct net *net, @@ -6530,6 +6531,51 @@ err1: return err; } +static int nft_delflowtable_hook(struct nft_ctx *ctx, + struct nft_flowtable *flowtable) +{ + const struct nlattr * const *nla = ctx->nla; + struct nft_flowtable_hook flowtable_hook; + struct nft_hook *this, *next, *hook; + struct nft_trans *trans; + int err; + + err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK], + &flowtable_hook, &flowtable->data); + if (err < 0) + return err; + + list_for_each_entry_safe(this, next, &flowtable_hook.list, list) { + hook = nft_hook_list_find(&flowtable->hook_list, this); + if (!hook) { + err = -ENOENT; + goto err_flowtable_del_hook; + } + hook->inactive = true; + list_del(&this->list); + kfree(this); + } + + trans = nft_trans_alloc(ctx, NFT_MSG_DELFLOWTABLE, + sizeof(struct nft_trans_flowtable)); + if (!trans) + return -ENOMEM; + + nft_trans_flowtable(trans) = flowtable; + nft_trans_flowtable_update(trans) = true; + INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans)); + + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + + return 0; + +err_flowtable_del_hook: + list_for_each_entry(hook, &flowtable_hook.list, list) + hook->inactive = false; + + return err; +} + static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, @@ -6568,13 +6614,17 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(flowtable); } + + nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + + if (nla[NFTA_FLOWTABLE_HOOK]) + return nft_delflowtable_hook(&ctx, flowtable); + if (flowtable->use > 0) { NL_SET_BAD_ATTR(extack, attr); return -EBUSY; } - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); - return nft_delflowtable(&ctx, flowtable); } @@ -7184,7 +7234,10 @@ static void nft_commit_release(struct nft_trans *trans) nft_obj_destroy(&trans->ctx, nft_trans_obj(trans)); break; case NFT_MSG_DELFLOWTABLE: - nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); + if (nft_trans_flowtable_update(trans)) + nft_flowtable_hooks_destroy(&nft_trans_flowtable_hooks(trans)); + else + 
nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); break; } @@ -7345,6 +7398,17 @@ static void nft_chain_del(struct nft_chain *chain) list_del_rcu(&chain->list); } +static void nft_flowtable_hooks_del(struct nft_flowtable *flowtable, + struct list_head *hook_list) +{ + struct nft_hook *hook, *next; + + list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { + if (hook->inactive) + list_move(&hook->list, hook_list); + } +} + static void nf_tables_module_autoload_cleanup(struct net *net) { struct nft_module_request *req, *next; @@ -7570,13 +7634,24 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELFLOWTABLE: - list_del_rcu(&nft_trans_flowtable(trans)->list); - nf_tables_flowtable_notify(&trans->ctx, - nft_trans_flowtable(trans), - &nft_trans_flowtable(trans)->hook_list, - NFT_MSG_DELFLOWTABLE); - nft_unregister_flowtable_net_hooks(net, - &nft_trans_flowtable(trans)->hook_list); + if (nft_trans_flowtable_update(trans)) { + nft_flowtable_hooks_del(nft_trans_flowtable(trans), + &nft_trans_flowtable_hooks(trans)); + nf_tables_flowtable_notify(&trans->ctx, + nft_trans_flowtable(trans), + &nft_trans_flowtable_hooks(trans), + NFT_MSG_DELFLOWTABLE); + nft_unregister_flowtable_net_hooks(net, + &nft_trans_flowtable_hooks(trans)); + } else { + list_del_rcu(&nft_trans_flowtable(trans)->list); + nf_tables_flowtable_notify(&trans->ctx, + nft_trans_flowtable(trans), + &nft_trans_flowtable(trans)->hook_list, + NFT_MSG_DELFLOWTABLE); + nft_unregister_flowtable_net_hooks(net, + &nft_trans_flowtable(trans)->hook_list); + } break; } } @@ -7638,6 +7713,7 @@ static int __nf_tables_abort(struct net *net, bool autoload) { struct nft_trans *trans, *next; struct nft_trans_elem *te; + struct nft_hook *hook; list_for_each_entry_safe_reverse(trans, next, &net->nft.commit_list, list) { @@ -7746,8 +7822,13 @@ static int __nf_tables_abort(struct net *net, bool autoload) } break; case NFT_MSG_DELFLOWTABLE: - trans->ctx.table->use++; - nft_clear(trans->ctx.net, nft_trans_flowtable(trans)); + if (nft_trans_flowtable_update(trans)) { + list_for_each_entry(hook, &nft_trans_flowtable(trans)->hook_list, list) + hook->inactive = false; + } else { + trans->ctx.table->use++; + nft_clear(trans->ctx.net, nft_trans_flowtable(trans)); + } nft_trans_destroy(trans); break; } -- cgit v1.2.3 From 05abe4456fa376040f6cc3cc6830d2e328723478 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 20 May 2020 15:44:37 +0200 Subject: netfilter: nf_tables: allow to register flowtable with no devices A flowtable might be composed of dynamic interfaces only. Such dynamic interfaces might show up at a later stage. This patch allows users to register a flowtable with no devices. Once the dynamic interface becomes available, the user adds the dynamic devices to the flowtable. 
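The hunks below create an asymmetry: a flowtable's device list may now parse to empty (the hooks arrive later via updates), while a netdev chain still rejects an empty list, since the list_empty() check moves into nft_chain_parse_netdev(). A sketch of that split, with a hypothetical allow_empty parameter:

    #include <errno.h>
    #include <stdbool.h>

    /* The caller decides whether an empty device list is acceptable. */
    static int parse_devs(const char **devs, int n, bool allow_empty)
    {
            if (n == 0)
                    return allow_empty ? 0 : -EINVAL;
            /* ... resolve each device name into a hook ... */
            return 0;
    }

    /* flowtable:    parse_devs(devs, n, true);   devices may come later
     * netdev chain: parse_devs(devs, n, false);  devices still required */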
Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'net')
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1c2c3bb78fa0..897ac5fbe079 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1724,8 +1724,6 @@ static int nf_tables_parse_netdev_hooks(struct net *net, goto err_hook; } } - if (!n) - return -EINVAL; return 0; @@ -1762,6 +1760,9 @@ static int nft_chain_parse_netdev(struct net *net, hook_list); if (err < 0) return err; + + if (list_empty(hook_list)) + return -EINVAL; } else { return -EINVAL; } @@ -6209,8 +6210,7 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, return err; if (!tb[NFTA_FLOWTABLE_HOOK_NUM] || - !tb[NFTA_FLOWTABLE_HOOK_PRIORITY] || - !tb[NFTA_FLOWTABLE_HOOK_DEVS]) + !tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) return -EINVAL; hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM])); @@ -6219,11 +6219,13 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY])); - err = nf_tables_parse_netdev_hooks(ctx->net, - tb[NFTA_FLOWTABLE_HOOK_DEVS], - &flowtable_hook->list); - if (err < 0) - return err; + if (tb[NFTA_FLOWTABLE_HOOK_DEVS]) { + err = nf_tables_parse_netdev_hooks(ctx->net, + tb[NFTA_FLOWTABLE_HOOK_DEVS], + &flowtable_hook->list); + if (err < 0) + return err; + } flowtable_hook->priority = priority; flowtable_hook->num = hooknum; -- cgit v1.2.3
From 5b6743fb2c2a1fcb31c8b227558f537095dbece4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 23 May 2020 12:05:22 +0200 Subject: netfilter: nf_tables: skip flowtable hooknum and priority on device updates On device updates, the hooknum and priority attributes are not required. This patch makes these two netlink attributes optional. Moreover, bail out with EOPNOTSUPP if userspace tries to update the hooknum and priority for existing flowtables. While at it, turn EINVAL into EOPNOTSUPP in case the hooknum is not ingress. EINVAL is reserved for missing netlink attributes / malformed netlink messages.
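The resulting rules: on creation, hook number and priority are mandatory; on an update they are optional, but if present they must match the existing flowtable, otherwise EOPNOTSUPP. Condensed into a sketch (hypothetical struct, not the kernel code):

    #include <errno.h>
    #include <stdbool.h>

    struct hook_attrs {
            bool has_num, has_prio;
            int num, prio;
    };

    static int check_hook_attrs(bool add, const struct hook_attrs *req,
                                const struct hook_attrs *cur)
    {
            if (add)
                    return (req->has_num && req->has_prio) ? 0 : -EINVAL;

            /* update: attributes are optional but immutable */
            if (req->has_num && req->num != cur->num)
                    return -EOPNOTSUPP;
            if (req->has_prio && req->prio != cur->prio)
                    return -EOPNOTSUPP;
            return 0;
    }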
Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 53 ++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 897ac5fbe079..073aa1051d43 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6195,7 +6195,7 @@ static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, const struct nlattr *attr, struct nft_flowtable_hook *flowtable_hook, - struct nf_flowtable *ft) + struct nft_flowtable *flowtable, bool add) { struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1]; struct nft_hook *hook; @@ -6209,15 +6209,35 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, if (err < 0) return err; - if (!tb[NFTA_FLOWTABLE_HOOK_NUM] || - !tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) - return -EINVAL; + if (add) { + if (!tb[NFTA_FLOWTABLE_HOOK_NUM] || + !tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) + return -EINVAL; - hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM])); - if (hooknum != NF_NETDEV_INGRESS) - return -EINVAL; + hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM])); + if (hooknum != NF_NETDEV_INGRESS) + return -EOPNOTSUPP; + + priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY])); + + flowtable_hook->priority = priority; + flowtable_hook->num = hooknum; + } else { + if (tb[NFTA_FLOWTABLE_HOOK_NUM]) { + hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM])); + if (hooknum != flowtable->hooknum) + return -EOPNOTSUPP; + } + + if (tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) { + priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY])); + if (priority != flowtable->data.priority) + return -EOPNOTSUPP; + } - priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY])); + flowtable_hook->priority = flowtable->data.priority; + flowtable_hook->num = flowtable->hooknum; + } if (tb[NFTA_FLOWTABLE_HOOK_DEVS]) { err = nf_tables_parse_netdev_hooks(ctx->net, @@ -6227,15 +6247,12 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, return err; } - flowtable_hook->priority = priority; - flowtable_hook->num = hooknum; - list_for_each_entry(hook, &flowtable_hook->list, list) { hook->ops.pf = NFPROTO_NETDEV; - hook->ops.hooknum = hooknum; - hook->ops.priority = priority; - hook->ops.priv = ft; - hook->ops.hook = ft->type->hook; + hook->ops.hooknum = flowtable_hook->num; + hook->ops.priority = flowtable_hook->priority; + hook->ops.priv = &flowtable->data; + hook->ops.hook = flowtable->data.type->hook; } return err; @@ -6363,7 +6380,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, int err; err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK], - &flowtable_hook, &flowtable->data); + &flowtable_hook, flowtable, false); if (err < 0) return err; @@ -6492,7 +6509,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, goto err3; err = nft_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK], - &flowtable_hook, &flowtable->data); + &flowtable_hook, flowtable, true); if (err < 0) goto err4; @@ -6543,7 +6560,7 @@ static int nft_delflowtable_hook(struct nft_ctx *ctx, int err; err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK], - &flowtable_hook, &flowtable->data); + &flowtable_hook, flowtable, false); if (err < 0) return err; -- cgit v1.2.3 From f745664257b62f6ba29f45fd21fbe7193b4da57b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 May 2020 19:48:49 
-0700 Subject: tcp: add tcp_ld_RTO_revert() helper RFC 6069 logic has been implemented for IPv4 only so far, right in the middle of tcp_v4_err() and was error prone. Move this code to one helper, to make tcp_v4_err() more readable and to eventually expand RFC 6069 to IPv6 in the future. Also perform sock_owned_by_user() check a bit sooner. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Tested-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 85 ++++++++++++++++++++++++++++------------------------- 1 file changed, 45 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6789671f0f5a..f32dcadc91b7 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -403,6 +403,45 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort) } EXPORT_SYMBOL(tcp_req_err); +/* TCP-LD (RFC 6069) logic */ +static void tcp_ld_RTO_revert(struct sock *sk, u32 seq) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + s32 remaining; + u32 delta_us; + + if (sock_owned_by_user(sk)) + return; + + if (seq != tp->snd_una || !icsk->icsk_retransmits || + !icsk->icsk_backoff) + return; + + skb = tcp_rtx_queue_head(sk); + if (WARN_ON_ONCE(!skb)) + return; + + icsk->icsk_backoff--; + icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; + icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); + + tcp_mstamp_refresh(tp); + delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); + remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); + + if (remaining > 0) { + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + remaining, TCP_RTO_MAX); + } else { + /* RTO revert clocked out retransmission. + * Will retransmit now. + */ + tcp_retransmit_timer(sk); + } +} + /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should @@ -423,17 +462,13 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)icmp_skb->data; struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); - struct inet_connection_sock *icsk; struct tcp_sock *tp; struct inet_sock *inet; const int type = icmp_hdr(icmp_skb)->type; const int code = icmp_hdr(icmp_skb)->code; struct sock *sk; - struct sk_buff *skb; struct request_sock *fastopen; u32 seq, snd_una; - s32 remaining; - u32 delta_us; int err; struct net *net = dev_net(icmp_skb->dev); @@ -476,7 +511,6 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) goto out; } - icsk = inet_csk(sk); tp = tcp_sk(sk); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ fastopen = rcu_dereference(tp->fastopen_rsk); @@ -521,41 +555,12 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) } err = icmp_err_convert[code].errno; - /* check if icmp_skb allows revert of backoff - * (see draft-zimmermann-tcp-lcd) */ - if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH) - break; - if (seq != tp->snd_una || !icsk->icsk_retransmits || - !icsk->icsk_backoff || fastopen) - break; - - if (sock_owned_by_user(sk)) - break; - - skb = tcp_rtx_queue_head(sk); - if (WARN_ON_ONCE(!skb)) - break; - - icsk->icsk_backoff--; - icsk->icsk_rto = tp->srtt_us ? 
__tcp_set_rto(tp) : - TCP_TIMEOUT_INIT; - icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); - - - tcp_mstamp_refresh(tp); - delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); - remaining = icsk->icsk_rto - - usecs_to_jiffies(delta_us); - - if (remaining > 0) { - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - remaining, TCP_RTO_MAX); - } else { - /* RTO revert clocked out retransmission. - * Will retransmit now */ - tcp_retransmit_timer(sk); - } - + /* check if this ICMP message allows revert of backoff. + * (see RFC 6069) + */ + if (!fastopen && + (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) + tcp_ld_RTO_revert(sk, seq); break; case ICMP_TIME_EXCEEDED: err = EHOSTUNREACH; -- cgit v1.2.3
From a12daf13a4492bb7fb26231e52afb381927f938e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 May 2020 19:48:50 -0700 Subject: tcp: rename tcp_v4_err() skb parameter This essentially reverts 4d1a2d9ec1c1 ("Revert Backoff [v3]: Rename skb to icmp_skb in tcp_v4_err()") Now that we have the tcp_ld_RTO_revert() helper, we can use the usual name for the sk_buff parameter, so that tcp_v4_err() and tcp_v6_err() use similar names. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'net')
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f32dcadc91b7..4eef5b84fff1 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -458,23 +458,23 @@ static void tcp_ld_RTO_revert(struct sock *sk, u32 seq) * */ -int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) +int tcp_v4_err(struct sk_buff *skb, u32 info) { - const struct iphdr *iph = (const struct iphdr *)icmp_skb->data; - struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); + const struct iphdr *iph = (const struct iphdr *)skb->data; + struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); struct tcp_sock *tp; struct inet_sock *inet; - const int type = icmp_hdr(icmp_skb)->type; - const int code = icmp_hdr(icmp_skb)->code; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; struct sock *sk; struct request_sock *fastopen; u32 seq, snd_una; int err; - struct net *net = dev_net(icmp_skb->dev); + struct net *net = dev_net(skb->dev); sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, th->dest, iph->saddr, ntohs(th->source), - inet_iif(icmp_skb), 0); + inet_iif(skb), 0); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return -ENOENT; @@ -524,7 +524,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) switch (type) { case ICMP_REDIRECT: if (!sock_owned_by_user(sk)) - do_redirect(icmp_skb, sk); + do_redirect(skb, sk); goto out; case ICMP_SOURCE_QUENCH: /* Just silently ignore these. */ @@ -578,7 +578,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (fastopen && !fastopen->sk) break; - ip_icmp_error(sk, icmp_skb, err, th->dest, info, (u8 *)th); + ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); if (!sock_owned_by_user(sk)) { sk->sk_err = err; -- cgit v1.2.3
From 4909daba37846317ec7dcba16fba009636f7fe21 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 26 May 2020 21:35:23 -0700 Subject: net_sched: use qdisc_reset() in qdisc_destroy() qdisc_destroy() calls ops->reset() and cleans up qdisc->gso_skb and qdisc->skb_bad_txq; these are nearly the same as qdisc_reset(), so just call it directly and consolidate the code for the next patch.
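The duplication being removed is easy to see in miniature: both paths run ops->reset() and then purge the gso_skb and skb_bad_txq queues, so destroy can simply call reset first. A reduced model with hypothetical types (the real code operates on struct Qdisc):

    struct qdisc;

    struct qdisc_ops {
            void (*reset)(struct qdisc *q);
            void (*destroy)(struct qdisc *q);
    };

    struct qdisc {
            const struct qdisc_ops *ops;
            /* gso_skb and skb_bad_txq queues elided */
    };

    static void purge_queues(struct qdisc *q) { (void)q; /* free both queues */ }

    static void qdisc_reset_model(struct qdisc *q)
    {
            if (q->ops->reset)
                    q->ops->reset(q);
            purge_queues(q);
    }

    static void qdisc_destroy_model(struct qdisc *q)
    {
            qdisc_reset_model(q);        /* replaces the duplicated purge code */
            if (q->ops->destroy)
                    q->ops->destroy(q);
    }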
Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index ebc55d884247..7a0b06001e48 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -949,7 +949,6 @@ static void qdisc_free_cb(struct rcu_head *head) static void qdisc_destroy(struct Qdisc *qdisc) { const struct Qdisc_ops *ops = qdisc->ops; - struct sk_buff *skb, *tmp; #ifdef CONFIG_NET_SCHED qdisc_hash_del(qdisc); @@ -957,24 +956,15 @@ static void qdisc_destroy(struct Qdisc *qdisc) qdisc_put_stab(rtnl_dereference(qdisc->stab)); #endif gen_kill_estimator(&qdisc->rate_est); - if (ops->reset) - ops->reset(qdisc); + + qdisc_reset(qdisc); + if (ops->destroy) ops->destroy(qdisc); module_put(ops->owner); dev_put(qdisc_dev(qdisc)); - skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) { - __skb_unlink(skb, &qdisc->gso_skb); - kfree_skb_list(skb); - } - - skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) { - __skb_unlink(skb, &qdisc->skb_bad_txq); - kfree_skb_list(skb); - } - call_rcu(&qdisc->rcu, qdisc_free_cb); } -- cgit v1.2.3 From a34dac0b9055202cf9c64e08d8d8dc5e23029d3a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 26 May 2020 21:35:24 -0700 Subject: net_sched: add tracepoints for qdisc_reset() and qdisc_destroy() Add two tracepoints for qdisc_reset() and qdisc_destroy() to track qdisc resetting and destroying. Sample output: tc-756 [000] ...3 138.355662: qdisc_reset: dev=ens3 kind=pfifo_fast parent=ffff:ffff handle=0:0 tc-756 [000] ...1 138.355720: qdisc_reset: dev=ens3 kind=pfifo_fast parent=ffff:ffff handle=0:0 tc-756 [000] ...1 138.355867: qdisc_reset: dev=ens3 kind=pfifo_fast parent=ffff:ffff handle=0:0 tc-756 [000] ...1 138.355930: qdisc_destroy: dev=ens3 kind=pfifo_fast parent=ffff:ffff handle=0:0 tc-757 [000] ...2 143.073780: qdisc_reset: dev=ens3 kind=fq_codel parent=ffff:ffff handle=8001:0 tc-757 [000] ...1 143.073878: qdisc_reset: dev=ens3 kind=fq_codel parent=ffff:ffff handle=8001:0 tc-757 [000] ...1 143.074114: qdisc_reset: dev=ens3 kind=fq_codel parent=ffff:ffff handle=8001:0 tc-757 [000] ...1 143.074228: qdisc_destroy: dev=ens3 kind=fq_codel parent=ffff:ffff handle=8001:0 Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- include/trace/events/qdisc.h | 52 ++++++++++++++++++++++++++++++++++++++++++++ net/sched/sch_generic.c | 4 ++++ 2 files changed, 56 insertions(+) (limited to 'net') diff --git a/include/trace/events/qdisc.h b/include/trace/events/qdisc.h index 0d1a9ebf55ba..2b948801afa3 100644 --- a/include/trace/events/qdisc.h +++ b/include/trace/events/qdisc.h @@ -8,6 +8,8 @@ #include #include #include +#include +#include TRACE_EVENT(qdisc_dequeue, @@ -44,6 +46,56 @@ TRACE_EVENT(qdisc_dequeue, __entry->txq_state, __entry->packets, __entry->skbaddr ) ); +TRACE_EVENT(qdisc_reset, + + TP_PROTO(struct Qdisc *q), + + TP_ARGS(q), + + TP_STRUCT__entry( + __string( dev, qdisc_dev(q) ) + __string( kind, q->ops->id ) + __field( u32, parent ) + __field( u32, handle ) + ), + + TP_fast_assign( + __assign_str(dev, qdisc_dev(q)); + __assign_str(kind, q->ops->id); + __entry->parent = q->parent; + __entry->handle = q->handle; + ), + + TP_printk("dev=%s kind=%s parent=%x:%x handle=%x:%x", __get_str(dev), + __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent), + TC_H_MAJ(__entry->handle) >> 16, TC_H_MIN(__entry->handle)) +); + +TRACE_EVENT(qdisc_destroy, + + TP_PROTO(struct Qdisc *q), + + TP_ARGS(q), + + TP_STRUCT__entry( + __string( dev, qdisc_dev(q) ) + __string( kind, q->ops->id ) + __field( u32, parent ) + __field( u32, handle ) + ), + + TP_fast_assign( + __assign_str(dev, qdisc_dev(q)); + __assign_str(kind, q->ops->id); + __entry->parent = q->parent; + __entry->handle = q->handle; + ), + + TP_printk("dev=%s kind=%s parent=%x:%x handle=%x:%x", __get_str(dev), + __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent), + TC_H_MAJ(__entry->handle) >> 16, TC_H_MIN(__entry->handle)) +); + #endif /* _TRACE_QDISC_H */ /* This part must be outside protection */ diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 7a0b06001e48..abaa446ed01a 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -911,6 +911,8 @@ void qdisc_reset(struct Qdisc *qdisc) const struct Qdisc_ops *ops = qdisc->ops; struct sk_buff *skb, *tmp; + trace_qdisc_reset(qdisc); + if (ops->reset) ops->reset(qdisc); @@ -965,6 +967,8 @@ static void qdisc_destroy(struct Qdisc *qdisc) module_put(ops->owner); dev_put(qdisc_dev(qdisc)); + trace_qdisc_destroy(qdisc); + call_rcu(&qdisc->rcu, qdisc_free_cb); } -- cgit v1.2.3 From f5a7833e83628f18c1ee94e6ffcb1d232f029be9 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 26 May 2020 21:35:25 -0700 Subject: net_sched: add a tracepoint for qdisc creation With this tracepoint, we can see when qdiscs are created, especially the default qdiscs. Sample output: tc-736 [001] ...1 56.230107: qdisc_create: dev=ens3 kind=pfifo parent=1:0 tc-736 [001] ...1 56.230113: qdisc_create: dev=ens3 kind=hfsc parent=ffff:ffff tc-738 [001] ...1 56.256816: qdisc_create: dev=ens3 kind=pfifo parent=1:100 tc-739 [001] ...1 56.267584: qdisc_create: dev=ens3 kind=pfifo parent=1:200 tc-740 [001] ...1 56.279649: qdisc_create: dev=ens3 kind=fq_codel parent=1:100 tc-741 [001] ...1 56.289996: qdisc_create: dev=ens3 kind=pfifo_fast parent=1:200 tc-745 [000] .N.1 111.687483: qdisc_create: dev=ens3 kind=ingress parent=ffff:fff1 Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller
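Like the qdisc_reset and qdisc_destroy events above, this event should be controllable at runtime through the usual tracepoint interface, e.g. echo 1 > /sys/kernel/tracing/events/qdisc/qdisc_create/enable, with output read from trace_pipe; the exact path depends on where tracefs is mounted (older setups expose it under /sys/kernel/debug/tracing).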
--- include/trace/events/qdisc.h | 23 +++++++++++++++++++++ net/sched/sch_api.c | 3 +++ net/sched/sch_generic.c | 4 +++- 3 files changed, 29 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/trace/events/qdisc.h b/include/trace/events/qdisc.h index 2b948801afa3..330d32d84485 100644 --- a/include/trace/events/qdisc.h +++ b/include/trace/events/qdisc.h @@ -96,6 +96,29 @@ TRACE_EVENT(qdisc_destroy, TC_H_MAJ(__entry->handle) >> 16, TC_H_MIN(__entry->handle)) ); +TRACE_EVENT(qdisc_create, + + TP_PROTO(const struct Qdisc_ops *ops, struct net_device *dev, u32 parent), + + TP_ARGS(ops, dev, parent), + + TP_STRUCT__entry( + __string( dev, dev->name ) + __string( kind, ops->id ) + __field( u32, parent ) + ), + + TP_fast_assign( + __assign_str(dev, dev->name); + __assign_str(kind, ops->id); + __entry->parent = parent; + ), + + TP_printk("dev=%s kind=%s parent=%x:%x", + __get_str(dev), __get_str(kind), + TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent)) +); + #endif /* _TRACE_QDISC_H */ /* This part must be outside protection */ diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 0d99df1e764d..9a3449b56bd6 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -32,6 +32,8 @@ #include #include +#include + /* Short review. @@ -1283,6 +1285,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, } qdisc_hash_add(sch, false); + trace_qdisc_create(ops, dev, parent); return sch; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index abaa446ed01a..a4271e47f220 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -896,8 +896,10 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, } sch->parent = parentid; - if (!ops->init || ops->init(sch, NULL, extack) == 0) + if (!ops->init || ops->init(sch, NULL, extack) == 0) { + trace_qdisc_create(ops, dev_queue->dev, parentid); return sch; + } qdisc_put(sch); return NULL; -- cgit v1.2.3 From 70f50965338a12e17454c31ad5ece27069719358 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 26 May 2020 21:35:26 -0700 Subject: net_sched: avoid resetting active qdisc multiple times MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Except for sch_mq and sch_mqprio, each dev queue points to the same root qdisc, so when we reset the dev queues with netdev_for_each_tx_queue() we end up resetting the same instance of the root qdisc multiple times. Avoid this by checking the __QDISC_STATE_DEACTIVATED bit in each iteration, so for sch_mq/sch_mqprio we still reset all of them as before, while for the rest we only reset it once. Reported-by: Václav Zindulka Tested-by: Václav Zindulka Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S.
Miller --- net/sched/sch_generic.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index a4271e47f220..d13e27467470 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1128,6 +1128,28 @@ void dev_activate(struct net_device *dev) } EXPORT_SYMBOL(dev_activate); +static void qdisc_deactivate(struct Qdisc *qdisc) +{ + bool nolock = qdisc->flags & TCQ_F_NOLOCK; + + if (qdisc->flags & TCQ_F_BUILTIN) + return; + if (test_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state)) + return; + + if (nolock) + spin_lock_bh(&qdisc->seqlock); + spin_lock_bh(qdisc_lock(qdisc)); + + set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state); + + qdisc_reset(qdisc); + + spin_unlock_bh(qdisc_lock(qdisc)); + if (nolock) + spin_unlock_bh(&qdisc->seqlock); +} + static void dev_deactivate_queue(struct net_device *dev, struct netdev_queue *dev_queue, void *_qdisc_default) @@ -1137,21 +1159,8 @@ static void dev_deactivate_queue(struct net_device *dev, qdisc = rtnl_dereference(dev_queue->qdisc); if (qdisc) { - bool nolock = qdisc->flags & TCQ_F_NOLOCK; - - if (nolock) - spin_lock_bh(&qdisc->seqlock); - spin_lock_bh(qdisc_lock(qdisc)); - - if (!(qdisc->flags & TCQ_F_BUILTIN)) - set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state); - + qdisc_deactivate(qdisc); rcu_assign_pointer(dev_queue->qdisc, qdisc_default); - qdisc_reset(qdisc); - - spin_unlock_bh(qdisc_lock(qdisc)); - if (nolock) - spin_unlock_bh(&qdisc->seqlock); } } -- cgit v1.2.3 From 759ae57f1b7bf5dfea1816f786f568d69efe34e2 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 26 May 2020 21:35:27 -0700 Subject: net_sched: get rid of unnecessary dev_qdisc_reset() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resetting the old qdisc on dev_queue->qdisc_sleeping in dev_qdisc_reset() is redundant, because this qdisc, even if it is not the same as dev_queue->qdisc, is reset via qdisc_put() right after dev_graft_qdisc() is called, when its refcount hits 0. This is very easy to observe with the qdisc_reset() tracepoint and stack traces. Reported-by: Václav Zindulka Tested-by: Václav Zindulka Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S.
Miller --- net/sched/sch_generic.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index d13e27467470..b19a0021a0bd 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1191,16 +1191,6 @@ static bool some_qdisc_is_busy(struct net_device *dev) return false; } -static void dev_qdisc_reset(struct net_device *dev, - struct netdev_queue *dev_queue, - void *none) -{ - struct Qdisc *qdisc = dev_queue->qdisc_sleeping; - - if (qdisc) - qdisc_reset(qdisc); -} - /** * dev_deactivate_many - deactivate transmissions on several devices * @head: list of devices to deactivate @@ -1237,12 +1227,6 @@ void dev_deactivate_many(struct list_head *head) */ schedule_timeout_uninterruptible(1); } - /* The new qdisc is assigned at this point so we can safely - * unwind stale skb lists and qdisc statistics - */ - netdev_for_each_tx_queue(dev, dev_qdisc_reset, NULL); - if (dev_ingress_queue(dev)) - dev_qdisc_reset(dev, dev_ingress_queue(dev), NULL); } } -- cgit v1.2.3 From b3ae2459f89773adcbf16fef4b68deaaa3be1929 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Wed, 27 May 2020 12:25:26 +0300 Subject: net/tls: Add force_resync for driver resync This patch adds a field to the tls rx offload context which enables drivers to force a send_resync call. This field can be used by drivers to request a resync at the next possible tls record. It is beneficial for hardware that provides the resync sequence number asynchronously. In such cases, the packet that triggered the resync does not contain the information required for a resync. Instead, the driver requests resync for all the following TLS records until the asynchronous notification with the resync request TCP sequence arrives. A follow-up series for mlx5e ConnectX-6DX TLS RX offload support will use this mechanism. Signed-off-by: Boris Pismenny Signed-off-by: Tariq Toukan Reviewed-by: Maxim Mikityanskiy Reviewed-by: Saeed Mahameed Signed-off-by: David S. Miller
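A rough sketch of the intended driver-side usage (the driver function name is hypothetical; only tls_offload_rx_force_resync_request() comes from this patch):

	/* Hypothetical driver event handler: the device reported that
	 * record tracking was lost, but the resync TCP sequence number
	 * will only arrive later via an asynchronous notification.
	 */
	static void mydrv_tls_rx_lost_sync(struct sock *sk)
	{
		/* Ask the TLS core to keep requesting a resync on every
		 * new record until the async notification supplies the
		 * real sequence number.
		 */
		tls_offload_rx_force_resync_request(sk);
	}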
--- include/net/tls.h | 12 +++++++++++- net/tls/tls_device.c | 9 ++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/tls.h b/include/net/tls.h index bf9eb4823933..cf9ec152fbb7 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -594,12 +594,22 @@ tls_driver_ctx(const struct sock *sk, enum tls_offload_ctx_dir direction) #endif /* The TLS context is valid until sk_destruct is called */ +#define RESYNC_REQ (1 << 0) +#define RESYNC_REQ_FORCE (1 << 1) static inline void tls_offload_rx_resync_request(struct sock *sk, __be32 seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); - atomic64_set(&rx_ctx->resync_req, ((u64)ntohl(seq) << 32) | 1); + atomic64_set(&rx_ctx->resync_req, ((u64)ntohl(seq) << 32) | RESYNC_REQ); +} + +static inline void tls_offload_rx_force_resync_request(struct sock *sk) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); + + atomic64_set(&rx_ctx->resync_req, RESYNC_REQ | RESYNC_REQ_FORCE); } static inline void diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index a562ebaaa33c..0e55f8365ce2 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -694,10 +694,11 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx; + bool is_req_pending, is_force_resync; u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE]; - u32 sock_data, is_req_pending; struct tls_prot_info *prot; s64 resync_req; + u32 sock_data; u32 req_seq; if (tls_ctx->rx_conf != TLS_HW) @@ -712,9 +713,11 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) resync_req = atomic64_read(&rx_ctx->resync_req); req_seq = resync_req >> 32; seq += TLS_HEADER_SIZE - 1; - is_req_pending = resync_req; + is_req_pending = resync_req & RESYNC_REQ; + is_force_resync = resync_req & RESYNC_REQ_FORCE; - if (likely(!is_req_pending) || req_seq != seq || + if (likely(!is_req_pending) || + (!is_force_resync && req_seq != seq) || !atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) return; break; -- cgit v1.2.3 From 50ce4c099bebf56be86c9448f7f4bcd34f33663c Mon Sep 17 00:00:00 2001 From: Jonas Falkevik Date: Wed, 27 May 2020 11:59:43 +0200 Subject: sctp: fix typo sctp_ulpevent_nofity_peer_addr_change change typo in function name "nofity" to "notify" sctp_ulpevent_nofity_peer_addr_change -> sctp_ulpevent_notify_peer_addr_change Signed-off-by: Jonas Falkevik Signed-off-by: David S.
Miller --- include/net/sctp/ulpevent.h | 2 +- net/sctp/associola.c | 8 ++++---- net/sctp/ulpevent.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index 0b032b92da0b..994e984eef32 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -80,7 +80,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_assoc_change( struct sctp_chunk *chunk, gfp_t gfp); -void sctp_ulpevent_nofity_peer_addr_change(struct sctp_transport *transport, +void sctp_ulpevent_notify_peer_addr_change(struct sctp_transport *transport, int state, int error); struct sctp_ulpevent *sctp_ulpevent_make_remote_error( diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 437079a4883d..72315137d7e7 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -432,7 +432,7 @@ void sctp_assoc_set_primary(struct sctp_association *asoc, changeover = 1 ; asoc->peer.primary_path = transport; - sctp_ulpevent_nofity_peer_addr_change(transport, + sctp_ulpevent_notify_peer_addr_change(transport, SCTP_ADDR_MADE_PRIM, 0); /* Set a default msg_name for events. */ @@ -574,7 +574,7 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc, asoc->peer.transport_count--; - sctp_ulpevent_nofity_peer_addr_change(peer, SCTP_ADDR_REMOVED, 0); + sctp_ulpevent_notify_peer_addr_change(peer, SCTP_ADDR_REMOVED, 0); sctp_transport_free(peer); } @@ -714,7 +714,7 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, list_add_tail_rcu(&peer->transports, &asoc->peer.transport_addr_list); asoc->peer.transport_count++; - sctp_ulpevent_nofity_peer_addr_change(peer, SCTP_ADDR_ADDED, 0); + sctp_ulpevent_notify_peer_addr_change(peer, SCTP_ADDR_ADDED, 0); /* If we do not yet have a primary path, set one. */ if (!asoc->peer.primary_path) { @@ -840,7 +840,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, * to the user. */ if (ulp_notify) - sctp_ulpevent_nofity_peer_addr_change(transport, + sctp_ulpevent_notify_peer_addr_change(transport, spc_state, error); /* Select new active and retran paths. */ diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index c82dbdcf13f2..f0640306e77f 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -336,7 +336,7 @@ fail: return NULL; } -void sctp_ulpevent_nofity_peer_addr_change(struct sctp_transport *transport, +void sctp_ulpevent_notify_peer_addr_change(struct sctp_transport *transport, int state, int error) { struct sctp_association *asoc = transport->asoc; -- cgit v1.2.3 From 7a15b2e013f535a125ad7351ffc808c79bc6de35 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 27 May 2020 20:22:29 +0200 Subject: net: remove kernel_getsockopt No users left. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- include/linux/net.h | 2 -- net/socket.c | 34 ---------------------------------- 2 files changed, 36 deletions(-) (limited to 'net') diff --git a/include/linux/net.h b/include/linux/net.h index 6451425e828f..74ef5d7315f7 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -303,8 +303,6 @@ int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, int flags); int kernel_getsockname(struct socket *sock, struct sockaddr *addr); int kernel_getpeername(struct socket *sock, struct sockaddr *addr); -int kernel_getsockopt(struct socket *sock, int level, int optname, char *optval, - int *optlen); int kernel_setsockopt(struct socket *sock, int level, int optname, char *optval, unsigned int optlen); int kernel_sendpage(struct socket *sock, struct page *page, int offset, diff --git a/net/socket.c b/net/socket.c index 80422fc3c836..81a98b6cbd08 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3624,40 +3624,6 @@ int kernel_getpeername(struct socket *sock, struct sockaddr *addr) } EXPORT_SYMBOL(kernel_getpeername); -/** - * kernel_getsockopt - get a socket option (kernel space) - * @sock: socket - * @level: API level (SOL_SOCKET, ...) - * @optname: option tag - * @optval: option value - * @optlen: option length - * - * Assigns the option length to @optlen. - * Returns 0 or an error. - */ - -int kernel_getsockopt(struct socket *sock, int level, int optname, - char *optval, int *optlen) -{ - mm_segment_t oldfs = get_fs(); - char __user *uoptval; - int __user *uoptlen; - int err; - - uoptval = (char __user __force *) optval; - uoptlen = (int __user __force *) optlen; - - set_fs(KERNEL_DS); - if (level == SOL_SOCKET) - err = sock_getsockopt(sock, level, optname, uoptval, uoptlen); - else - err = sock->ops->getsockopt(sock, level, optname, uoptval, - uoptlen); - set_fs(oldfs); - return err; -} -EXPORT_SYMBOL(kernel_getsockopt); - /** * kernel_setsockopt - set a socket option (kernel space) * @sock: socket -- cgit v1.2.3 From a7528198add88a6f51b8ade17a5cf86804b8f7ee Mon Sep 17 00:00:00 2001 From: Markus Theil Date: Wed, 27 May 2020 18:03:34 +0200 Subject: mac80211: support control port TX status reporting Add support for TX status reporting for the control port TX API; this will be used by hostapd when it moves to the control port TX API. 
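Concretely, when a control port frame was sent with a TX-status cookie, the ack path now routes the report to the matching cfg80211 notifier based on the frame type; a condensed sketch of that dispatch (simplified from the status.c hunk below, with the pre-existing ack-signal branch elided):

	/* inside ieee80211_report_ack_skb(), sketch only */
	if (ieee80211_is_mgmt(hdr->frame_control))
		cfg80211_mgmt_tx_status(&sdata->wdev, cookie,
					skb->data, skb->len,
					acked, GFP_ATOMIC);
	else
		cfg80211_control_port_tx_status(&sdata->wdev, cookie,
						skb->data, skb->len,
						acked, GFP_ATOMIC);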
Signed-off-by: Markus Theil Link: https://lore.kernel.org/r/20200527160334.19224-1-markus.theil@tu-ilmenau.de [fix commit message, it was referring to nl80211] Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 3 ++- net/mac80211/main.c | 2 ++ net/mac80211/status.c | 9 ++++++- net/mac80211/tdls.c | 2 +- net/mac80211/tx.c | 62 ++++++++++++++++++++++++++++++++-------------- 5 files changed, 56 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index b87dc873825b..b7935f3d000d 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1783,7 +1783,8 @@ netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb, void __ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev, u32 info_flags, - u32 ctrl_flags); + u32 ctrl_flags, + u64 *cookie); void ieee80211_purge_tx_queue(struct ieee80211_hw *hw, struct sk_buff_head *skbs); struct sk_buff * diff --git a/net/mac80211/main.c b/net/mac80211/main.c index ac74bd780b42..b4a2efe8e83a 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -596,6 +596,8 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211); wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_CONTROL_PORT_NO_PREAUTH); + wiphy_ext_feature_set(wiphy, + NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_TX_STATUS); wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_SCAN_FREQ_KHZ); diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 22512805eafb..7b1bacac39c6 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -649,10 +649,17 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local, info->status.ack_signal, info->status.is_valid_ack_signal, GFP_ATOMIC); - else + else if (ieee80211_is_mgmt(hdr->frame_control)) cfg80211_mgmt_tx_status(&sdata->wdev, cookie, skb->data, skb->len, acked, GFP_ATOMIC); + else + cfg80211_control_port_tx_status(&sdata->wdev, + cookie, + skb->data, + skb->len, + acked, + GFP_ATOMIC); } rcu_read_unlock(); diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 8ad420db3766..4b0cff4a07bd 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -1054,7 +1054,7 @@ ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev, /* disable bottom halves when entering the Tx path */ local_bh_disable(); - __ieee80211_subif_start_xmit(skb, dev, flags, 0); + __ieee80211_subif_start_xmit(skb, dev, flags, 0, NULL); local_bh_enable(); return ret; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 5931128e1855..e9ce658141f5 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2436,13 +2436,19 @@ int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, return 0; } -static int ieee80211_store_ack_skb(struct ieee80211_local *local, +static u16 ieee80211_store_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, - u32 *info_flags) + u32 *info_flags, + u64 *cookie) { - struct sk_buff *ack_skb = skb_clone_sk(skb); + struct sk_buff *ack_skb; u16 info_id = 0; + if (skb->sk) + ack_skb = skb_clone_sk(skb); + else + ack_skb = skb_clone(skb, GFP_ATOMIC); + if (ack_skb) { unsigned long flags; int id; @@ -2455,6 +2461,10 @@ static int ieee80211_store_ack_skb(struct ieee80211_local *local, if (id >= 0) { info_id = id; *info_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; + if (cookie) { + *cookie = ieee80211_mgmt_tx_cookie(local); + IEEE80211_SKB_CB(ack_skb)->ack.cookie = *cookie; + } } else { kfree_skb(ack_skb); } @@ -2484,7 +2494,8 @@ static 
int ieee80211_store_ack_skb(struct ieee80211_local *local, */ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 info_flags, - struct sta_info *sta, u32 ctrl_flags) + struct sta_info *sta, u32 ctrl_flags, + u64 *cookie) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info; @@ -2755,9 +2766,11 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, goto free; } - if (unlikely(!multicast && skb->sk && - skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) - info_id = ieee80211_store_ack_skb(local, skb, &info_flags); + if (unlikely(!multicast && ((skb->sk && + skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) || + ctrl_flags & IEEE80211_TX_CTL_REQ_TX_STATUS))) + info_id = ieee80211_store_ack_skb(local, skb, &info_flags, + cookie); /* * If the skb is shared we need to obtain our own copy. @@ -3913,7 +3926,8 @@ EXPORT_SYMBOL(ieee80211_txq_schedule_start); void __ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev, u32 info_flags, - u32 ctrl_flags) + u32 ctrl_flags, + u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; @@ -3983,7 +3997,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, skb_mark_not_on_list(skb); skb = ieee80211_build_hdr(sdata, skb, info_flags, - sta, ctrl_flags); + sta, ctrl_flags, cookie); if (IS_ERR(skb)) { kfree_skb_list(next); goto out; @@ -4125,9 +4139,9 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, __skb_queue_head_init(&queue); ieee80211_convert_to_unicast(skb, dev, &queue); while ((skb = __skb_dequeue(&queue))) - __ieee80211_subif_start_xmit(skb, dev, 0, 0); + __ieee80211_subif_start_xmit(skb, dev, 0, 0, NULL); } else { - __ieee80211_subif_start_xmit(skb, dev, 0, 0); + __ieee80211_subif_start_xmit(skb, dev, 0, 0, NULL); } return NETDEV_TX_OK; @@ -4215,7 +4229,7 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, if (unlikely(!multicast && skb->sk && skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) - ieee80211_store_ack_skb(local, skb, &info->flags); + ieee80211_store_ack_skb(local, skb, &info->flags, NULL); memset(info, 0, sizeof(*info)); @@ -4299,7 +4313,7 @@ ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata, goto out; } - skb = ieee80211_build_hdr(sdata, skb, info_flags, sta, 0); + skb = ieee80211_build_hdr(sdata, skb, info_flags, sta, 0, NULL); if (IS_ERR(skb)) goto out; @@ -5347,7 +5361,7 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, struct sk_buff *skb; struct ethhdr *ehdr; u32 ctrl_flags = 0; - u32 flags; + u32 flags = 0; /* Only accept CONTROL_PORT_PROTOCOL configured in CONNECT/ASSOCIATE * or Pre-Authentication @@ -5360,9 +5374,13 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, ctrl_flags |= IEEE80211_TX_CTRL_PORT_CTRL_PROTO; if (unencrypted) - flags = IEEE80211_TX_INTFL_DONT_ENCRYPT; - else - flags = 0; + flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + + if (cookie) + ctrl_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; + + flags |= IEEE80211_TX_INTFL_NL80211_FRAME_TX | + IEEE80211_TX_CTL_INJECTED; skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(struct ethhdr) + len); @@ -5383,10 +5401,15 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, skb_reset_network_header(skb); skb_reset_mac_header(skb); + /* mutex lock is only needed for incrementing the cookie counter */ + mutex_lock(&local->mtx); + local_bh_disable(); - 
__ieee80211_subif_start_xmit(skb, skb->dev, flags, ctrl_flags); + __ieee80211_subif_start_xmit(skb, skb->dev, flags, ctrl_flags, cookie); local_bh_enable(); + mutex_unlock(&local->mtx); + return 0; } @@ -5413,7 +5436,8 @@ int ieee80211_probe_mesh_link(struct wiphy *wiphy, struct net_device *dev, local_bh_disable(); __ieee80211_subif_start_xmit(skb, skb->dev, 0, - IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP); + IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP, + NULL); local_bh_enable(); return 0; -- cgit v1.2.3 From d29245692a44d71d5e2e0770463184a693696232 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 May 2020 17:34:58 -0700 Subject: tcp: ipv6: support RFC 6069 (TCP-LD) Make tcp_ld_RTO_revert() helper available to IPv6, and implement RFC 6069 : Quoting this RFC : 3. Connectivity Disruption Indication For Internet Protocol version 6 (IPv6) [RFC2460], the counterpart of the ICMP destination unreachable message of code 0 (net unreachable) and of code 1 (host unreachable) is the ICMPv6 destination unreachable message of code 0 (no route to destination) [RFC4443]. As with IPv4, a router should generate an ICMPv6 destination unreachable message of code 0 in response to a packet that cannot be delivered to its destination address because it lacks a matching entry in its routing table. Signed-off-by: Eric Dumazet Acked-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + net/ipv4/tcp_ipv4.c | 3 ++- net/ipv6/tcp_ipv6.c | 9 +++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index b681338a8320..66e4b8331850 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -437,6 +437,7 @@ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb); void tcp_v4_mtu_reduced(struct sock *sk); void tcp_req_err(struct sock *sk, u32 seq, bool abort); +void tcp_ld_RTO_revert(struct sock *sk, u32 seq); int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb); struct sock *tcp_create_openreq_child(const struct sock *sk, struct request_sock *req, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4eef5b84fff1..ad6435ba6d72 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -404,7 +404,7 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort) EXPORT_SYMBOL(tcp_req_err); /* TCP-LD (RFC 6069) logic */ -static void tcp_ld_RTO_revert(struct sock *sk, u32 seq) +void tcp_ld_RTO_revert(struct sock *sk, u32 seq) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -441,6 +441,7 @@ static void tcp_ld_RTO_revert(struct sock *sk, u32 seq) tcp_retransmit_timer(sk); } } +EXPORT_SYMBOL(tcp_ld_RTO_revert); /* * This routine is called by the ICMP module when it gets some diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 01a6f5111a77..b7415ca75c2d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -473,6 +473,15 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } else sk->sk_err_soft = err; goto out; + case TCP_LISTEN: + break; + default: + /* check if this ICMP message allows revert of backoff. 
+ * (see RFC 6069) + */ + if (!fastopen && type == ICMPV6_DEST_UNREACH && + code == ICMPV6_NOROUTE) + tcp_ld_RTO_revert(sk, seq); } if (!sock_owned_by_user(sk) && np->recverr) { -- cgit v1.2.3 From b58f0e8f38c0a44afa59601a115bd231f23471e1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:09 +0200 Subject: net: add sock_set_reuseaddr Add a helper to directly set the SO_REUSEADDR sockopt from kernel space without going through a fake uaccess. For this the iscsi target now has to formally depend on inet to avoid a mostly theoretical compile failure. For actual operation it already did depend on having ipv4 or ipv6 support. Signed-off-by: Christoph Hellwig Acked-by: Sagi Grimberg Signed-off-by: David S. Miller --- drivers/infiniband/sw/siw/siw_cm.c | 18 +++++------------- drivers/nvme/target/tcp.c | 8 +------- drivers/target/iscsi/Kconfig | 2 +- drivers/target/iscsi/iscsi_target_login.c | 9 +-------- fs/dlm/lowcomms.c | 6 +----- include/net/sock.h | 2 ++ net/core/sock.c | 8 ++++++++ 7 files changed, 19 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index 559e5fd3bad8..d1860f3e8740 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -1312,17 +1312,14 @@ static void siw_cm_llp_state_change(struct sock *sk) static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr, struct sockaddr *raddr) { - int rv, flags = 0, s_val = 1; + int rv, flags = 0; size_t size = laddr->sa_family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); /* * Make address available again asap. */ - rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val, - sizeof(s_val)); - if (rv < 0) - return rv; + sock_set_reuseaddr(s->sk); rv = s->ops->bind(s, laddr, size); if (rv < 0) @@ -1781,7 +1778,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) struct siw_cep *cep = NULL; struct siw_device *sdev = to_siw_dev(id->device); int addr_family = id->local_addr.ss_family; - int rv = 0, s_val; + int rv = 0; if (addr_family != AF_INET && addr_family != AF_INET6) return -EAFNOSUPPORT; @@ -1793,13 +1790,8 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) /* * Allow binding local port when still in TIME_WAIT from last close. 
*/ - s_val = 1; - rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val, - sizeof(s_val)); - if (rv) { - siw_dbg(id->device, "setsockopt error: %d\n", rv); - goto error; - } + sock_set_reuseaddr(s->sk); + if (addr_family == AF_INET) { struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr); diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index f0da04e960f4..40757a63f455 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1632,6 +1632,7 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) port->sock->sk->sk_user_data = port; port->data_ready = port->sock->sk->sk_data_ready; port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready; + sock_set_reuseaddr(port->sock->sk); opt = 1; ret = kernel_setsockopt(port->sock, IPPROTO_TCP, @@ -1641,13 +1642,6 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) goto err_sock; } - ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_REUSEADDR, - (char *)&opt, sizeof(opt)); - if (ret) { - pr_err("failed to set SO_REUSEADDR sock opt %d\n", ret); - goto err_sock; - } - if (so_priority > 0) { ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_PRIORITY, (char *)&so_priority, sizeof(so_priority)); diff --git a/drivers/target/iscsi/Kconfig b/drivers/target/iscsi/Kconfig index 1f93ea381353..922484ea4e30 100644 --- a/drivers/target/iscsi/Kconfig +++ b/drivers/target/iscsi/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config ISCSI_TARGET tristate "Linux-iSCSI.org iSCSI Target Mode Stack" - depends on NET + depends on INET select CRYPTO select CRYPTO_CRC32C select CRYPTO_CRC32C_INTEL if X86 diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c index 731ee67fe914..91acb3f07b4c 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -909,14 +909,7 @@ int iscsit_setup_np( } } - /* FIXME: Someone please explain why this is endian-safe */ - ret = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - (char *)&opt, sizeof(opt)); - if (ret < 0) { - pr_err("kernel_setsockopt() for SO_REUSEADDR" - " failed\n"); - goto fail; - } + sock_set_reuseaddr(sock->sk); ret = kernel_setsockopt(sock, IPPROTO_IP, IP_FREEBIND, (char *)&opt, sizeof(opt)); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index f13dad0fd9ef..88f2574ca63a 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1127,12 +1127,8 @@ static struct socket *tcp_create_listen_sock(struct connection *con, kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, sizeof(one)); - result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - (char *)&one, sizeof(one)); + sock_set_reuseaddr(sock->sk); - if (result < 0) { - log_print("Failed to set SO_REUSEADDR on socket: %d", result); - } write_lock_bh(&sock->sk->sk_callback_lock); sock->sk->sk_user_data = con; save_listen_callbacks(sock); diff --git a/include/net/sock.h b/include/net/sock.h index 3e8c6d4b4b59..2ec085044790 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2688,4 +2688,6 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) void sock_def_readable(struct sock *sk); +void sock_set_reuseaddr(struct sock *sk); + #endif /* _SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index fd85e651ce28..18eb84fdf5fb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -712,6 +712,14 @@ bool sk_mc_loop(struct sock *sk) } EXPORT_SYMBOL(sk_mc_loop); +void sock_set_reuseaddr(struct sock *sk) +{ + lock_sock(sk); + sk->sk_reuse = SK_CAN_REUSE; + 
release_sock(sk); +} +EXPORT_SYMBOL(sock_set_reuseaddr); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. -- cgit v1.2.3 From c433594c07457d2b2e41a87014bfad9bec279abf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:10 +0200 Subject: net: add sock_no_linger Add a helper to directly set the SO_LINGER sockopt from kernel space with onoff set to true and a linger time of 0 without going through a fake uaccess. Signed-off-by: Christoph Hellwig Acked-by: Sagi Grimberg Signed-off-by: David S. Miller --- drivers/nvme/host/tcp.c | 9 +-------- drivers/nvme/target/tcp.c | 6 +----- include/net/sock.h | 1 + net/core/sock.c | 9 +++++++++ net/rds/tcp.h | 1 - net/rds/tcp_connect.c | 2 +- net/rds/tcp_listen.c | 13 +------------ net/sunrpc/svcsock.c | 12 ++---------- 8 files changed, 16 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index c15a92163c1f..e72d87482eb7 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1313,7 +1313,6 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue = &ctrl->queues[qid]; - struct linger sol = { .l_onoff = 1, .l_linger = 0 }; int ret, opt, rcv_pdu_size; queue->ctrl = ctrl; @@ -1361,13 +1360,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, * close. This is done to prevent stale data from being sent should * the network connection be restored before TCP times out. */ - ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER, - (char *)&sol, sizeof(sol)); - if (ret) { - dev_err(nctrl->device, - "failed to set SO_LINGER sock opt %d\n", ret); - goto err_sock; - } + sock_no_linger(queue->sock->sk); if (so_priority > 0) { ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_PRIORITY, diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 40757a63f455..e0801494b097 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1429,7 +1429,6 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) { struct socket *sock = queue->sock; struct inet_sock *inet = inet_sk(sock->sk); - struct linger sol = { .l_onoff = 1, .l_linger = 0 }; int ret; ret = kernel_getsockname(sock, @@ -1447,10 +1446,7 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) * close. This is done to prevent stale data from being sent should * the network connection be restored before TCP times out. 
*/ - ret = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, - (char *)&sol, sizeof(sol)); - if (ret) - return ret; + sock_no_linger(sock->sk); if (so_priority > 0) { ret = kernel_setsockopt(sock, SOL_SOCKET, SO_PRIORITY, diff --git a/include/net/sock.h b/include/net/sock.h index 2ec085044790..6ed00bf009bb 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2688,6 +2688,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) void sock_def_readable(struct sock *sk); +void sock_no_linger(struct sock *sk); void sock_set_reuseaddr(struct sock *sk); #endif /* _SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index 18eb84fdf5fb..f0f09524911c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -720,6 +720,15 @@ void sock_set_reuseaddr(struct sock *sk) } EXPORT_SYMBOL(sock_set_reuseaddr); +void sock_no_linger(struct sock *sk) +{ + lock_sock(sk); + sk->sk_lingertime = 0; + sock_set_flag(sk, SOCK_LINGER); + release_sock(sk); +} +EXPORT_SYMBOL(sock_no_linger); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 3c69361d21c7..d640e210b97b 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -73,7 +73,6 @@ void rds_tcp_listen_data_ready(struct sock *sk); int rds_tcp_accept_one(struct socket *sock); int rds_tcp_keepalive(struct socket *sock); void *rds_tcp_listen_sock_def_readable(struct net *net); -void rds_tcp_set_linger(struct socket *sock); /* tcp_recv.c */ int rds_tcp_recv_init(void); diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 008f50fb25dd..4e64598176b0 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -207,7 +207,7 @@ void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp) if (sock) { if (rds_destroy_pending(cp->cp_conn)) - rds_tcp_set_linger(sock); + sock_no_linger(sock->sk); sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN); lock_sock(sock->sk); rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */ diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 810a3a49e947..bbb31b9c0b39 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -111,17 +111,6 @@ struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) return NULL; } -void rds_tcp_set_linger(struct socket *sock) -{ - struct linger no_linger = { - .l_onoff = 1, - .l_linger = 0, - }; - - kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, - (char *)&no_linger, sizeof(no_linger)); -} - int rds_tcp_accept_one(struct socket *sock) { struct socket *new_sock = NULL; @@ -241,7 +230,7 @@ rst_nsk: * be pending on it. By setting linger, we achieve the side-effect * of avoiding TIME_WAIT state on new_sock. 
*/ - rds_tcp_set_linger(new_sock); + sock_no_linger(new_sock->sk); kernel_sock_shutdown(new_sock, SHUT_RDWR); ret = 0; out: diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 023514e392b3..6773dacc64d8 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -323,17 +323,9 @@ static int svc_tcp_has_wspace(struct svc_xprt *xprt) static void svc_tcp_kill_temp_xprt(struct svc_xprt *xprt) { - struct svc_sock *svsk; - struct socket *sock; - struct linger no_linger = { - .l_onoff = 1, - .l_linger = 0, - }; + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); - svsk = container_of(xprt, struct svc_sock, sk_xprt); - sock = svsk->sk_sock; - kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, - (char *)&no_linger, sizeof(no_linger)); + sock_no_linger(svsk->sk_sock->sk); } /* -- cgit v1.2.3 From 6e43496745e75ac49d644df984d2f4ee5b5b6b4e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:11 +0200 Subject: net: add sock_set_priority Add a helper to directly set the SO_PRIORITY sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Acked-by: Sagi Grimberg Signed-off-by: David S. Miller --- drivers/nvme/host/tcp.c | 12 ++---------- drivers/nvme/target/tcp.c | 18 ++++-------------- include/net/sock.h | 1 + net/core/sock.c | 8 ++++++++ 4 files changed, 15 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index e72d87482eb7..a307972d33a0 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1362,16 +1362,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, */ sock_no_linger(queue->sock->sk); - if (so_priority > 0) { - ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_PRIORITY, - (char *)&so_priority, sizeof(so_priority)); - if (ret) { - dev_err(ctrl->ctrl.device, - "failed to set SO_PRIORITY sock opt, ret %d\n", - ret); - goto err_sock; - } - } + if (so_priority > 0) + sock_set_priority(queue->sock->sk, so_priority); /* Set socket type of service */ if (nctrl->opts->tos >= 0) { diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index e0801494b097..f3088156d01d 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1448,12 +1448,8 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) */ sock_no_linger(sock->sk); - if (so_priority > 0) { - ret = kernel_setsockopt(sock, SOL_SOCKET, SO_PRIORITY, - (char *)&so_priority, sizeof(so_priority)); - if (ret) - return ret; - } + if (so_priority > 0) + sock_set_priority(sock->sk, so_priority); /* Set socket type of service */ if (inet->rcv_tos > 0) { @@ -1638,14 +1634,8 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) goto err_sock; } - if (so_priority > 0) { - ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_PRIORITY, - (char *)&so_priority, sizeof(so_priority)); - if (ret) { - pr_err("failed to set SO_PRIORITY sock opt %d\n", ret); - goto err_sock; - } - } + if (so_priority > 0) + sock_set_priority(port->sock->sk, so_priority); ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr, sizeof(port->addr)); diff --git a/include/net/sock.h b/include/net/sock.h index 6ed00bf009bb..a3a43141a4be 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2689,6 +2689,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) void sock_def_readable(struct sock *sk); void sock_no_linger(struct sock *sk); +void sock_set_priority(struct sock *sk, u32 priority); void sock_set_reuseaddr(struct sock *sk); #endif 
/* _SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index f0f09524911c..ceda1a9248b3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -729,6 +729,14 @@ void sock_no_linger(struct sock *sk) } EXPORT_SYMBOL(sock_no_linger); +void sock_set_priority(struct sock *sk, u32 priority) +{ + lock_sock(sk); + sk->sk_priority = priority; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_priority); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. -- cgit v1.2.3 From 76ee0785f42afbc0418072b7179d95f450d3c9a8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:12 +0200 Subject: net: add sock_set_sndtimeo Add a helper to directly set the SO_SNDTIMEO_NEW sockopt from kernel space without going through a fake uaccess. The interface is simplified to only pass the seconds value, as that is the only thing needed at the moment. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- fs/dlm/lowcomms.c | 8 ++------ include/net/sock.h | 1 + net/core/sock.c | 11 +++++++++++ 3 files changed, 14 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 88f2574ca63a..b79711d0aac7 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -918,7 +918,6 @@ static void sctp_connect_to_sock(struct connection *con) int result; int addr_len; struct socket *sock; - struct __kernel_sock_timeval tv = { .tv_sec = 5, .tv_usec = 0 }; if (con->nodeid == 0) { log_print("attempt to connect sock 0 foiled"); @@ -970,13 +969,10 @@ static void sctp_connect_to_sock(struct connection *con) * since O_NONBLOCK argument in connect() function does not work here, * then, we should restore the default value of this attribute. */ - kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_NEW, (char *)&tv, - sizeof(tv)); + sock_set_sndtimeo(sock->sk, 5); result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len, 0); - memset(&tv, 0, sizeof(tv)); - kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_NEW, (char *)&tv, - sizeof(tv)); + sock_set_sndtimeo(sock->sk, 0); if (result == -EINPROGRESS) result = 0; diff --git a/include/net/sock.h b/include/net/sock.h index a3a43141a4be..9a7b9e98685a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2691,5 +2691,6 @@ void sock_def_readable(struct sock *sk); void sock_no_linger(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); void sock_set_reuseaddr(struct sock *sk); +void sock_set_sndtimeo(struct sock *sk, s64 secs); #endif /* _SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index ceda1a9248b3..d3b1d61e4f76 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -737,6 +737,17 @@ void sock_set_priority(struct sock *sk, u32 priority) } EXPORT_SYMBOL(sock_set_priority); +void sock_set_sndtimeo(struct sock *sk, s64 secs) +{ + lock_sock(sk); + if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) + sk->sk_sndtimeo = secs * HZ; + else + sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_sndtimeo); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. -- cgit v1.2.3 From 7594888c782e735f8a7b110094307a4dbe7b3f03 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:13 +0200 Subject: net: add sock_bindtoindex Add a helper to directly set the SO_BINDTOIFINDEX sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- include/net/sock.h | 1 + net/core/sock.c | 21 +++++++++++++++------ net/ipv4/udp_tunnel.c | 4 +--- net/ipv6/ip6_udp_tunnel.c | 4 +--- 4 files changed, 18 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 9a7b9e98685a..cdec7bc055d5 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2688,6 +2688,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) void sock_def_readable(struct sock *sk); +int sock_bindtoindex(struct sock *sk, int ifindex); void sock_no_linger(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); void sock_set_reuseaddr(struct sock *sk); diff --git a/net/core/sock.c b/net/core/sock.c index d3b1d61e4f76..23f80880fbb2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -566,7 +566,7 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) } EXPORT_SYMBOL(sk_dst_check); -static int sock_setbindtodevice_locked(struct sock *sk, int ifindex) +static int sock_bindtoindex_locked(struct sock *sk, int ifindex) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES @@ -594,6 +594,18 @@ out: return ret; } +int sock_bindtoindex(struct sock *sk, int ifindex) +{ + int ret; + + lock_sock(sk); + ret = sock_bindtoindex_locked(sk, ifindex); + release_sock(sk); + + return ret; +} +EXPORT_SYMBOL(sock_bindtoindex); + static int sock_setbindtodevice(struct sock *sk, char __user *optval, int optlen) { @@ -634,10 +646,7 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval, goto out; } - lock_sock(sk); - ret = sock_setbindtodevice_locked(sk, index); - release_sock(sk); - + return sock_bindtoindex(sk, index); out: #endif @@ -1216,7 +1225,7 @@ set_rcvbuf: break; case SO_BINDTOIFINDEX: - ret = sock_setbindtodevice_locked(sk, val); + ret = sock_bindtoindex_locked(sk, val); break; default: diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 150e6f0fdbf5..2158e8bddf41 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -22,9 +22,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, goto error; if (cfg->bind_ifindex) { - err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTOIFINDEX, - (void *)&cfg->bind_ifindex, - sizeof(cfg->bind_ifindex)); + err = sock_bindtoindex(sock->sk, cfg->bind_ifindex); if (err < 0) goto error; } diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 58956a6b66a2..6523609516d2 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -33,9 +33,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, goto error; } if (cfg->bind_ifindex) { - err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTOIFINDEX, - (void *)&cfg->bind_ifindex, - sizeof(cfg->bind_ifindex)); + err = sock_bindtoindex(sock->sk, cfg->bind_ifindex); if (err < 0) goto error; } -- cgit v1.2.3 From 783da70e83967efeacf3c02c9dcfdc2b17bd62eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:14 +0200 Subject: net: add sock_enable_timestamps Add a helper to directly enable timestamps instead of setting the SO_TIMESTAMP* sockopts from kernel space and going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- include/net/sock.h | 1 + net/core/sock.c | 47 +++++++++++++++++++++++++++++------------------ net/rxrpc/local_object.c | 8 +------- 3 files changed, 31 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index cdec7bc055d5..99ef43508d2b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2689,6 +2689,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) void sock_def_readable(struct sock *sk); int sock_bindtoindex(struct sock *sk, int ifindex); +void sock_enable_timestamps(struct sock *sk); void sock_no_linger(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); void sock_set_reuseaddr(struct sock *sk); diff --git a/net/core/sock.c b/net/core/sock.c index 23f80880fbb2..e4a4dd2b3d8b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -757,6 +757,28 @@ void sock_set_sndtimeo(struct sock *sk, s64 secs) } EXPORT_SYMBOL(sock_set_sndtimeo); +static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) +{ + if (val) { + sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); + sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); + sock_set_flag(sk, SOCK_RCVTSTAMP); + sock_enable_timestamp(sk, SOCK_TIMESTAMP); + } else { + sock_reset_flag(sk, SOCK_RCVTSTAMP); + sock_reset_flag(sk, SOCK_RCVTSTAMPNS); + sock_reset_flag(sk, SOCK_TSTAMP_NEW); + } +} + +void sock_enable_timestamps(struct sock *sk) +{ + lock_sock(sk); + __sock_set_timestamps(sk, true, false, true); + release_sock(sk); +} +EXPORT_SYMBOL(sock_enable_timestamps); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. @@ -948,28 +970,17 @@ set_rcvbuf: break; case SO_TIMESTAMP_OLD: + __sock_set_timestamps(sk, valbool, false, false); + break; case SO_TIMESTAMP_NEW: + __sock_set_timestamps(sk, valbool, true, false); + break; case SO_TIMESTAMPNS_OLD: + __sock_set_timestamps(sk, valbool, false, true); + break; case SO_TIMESTAMPNS_NEW: - if (valbool) { - if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW) - sock_set_flag(sk, SOCK_TSTAMP_NEW); - else - sock_reset_flag(sk, SOCK_TSTAMP_NEW); - - if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW) - sock_reset_flag(sk, SOCK_RCVTSTAMPNS); - else - sock_set_flag(sk, SOCK_RCVTSTAMPNS); - sock_set_flag(sk, SOCK_RCVTSTAMP); - sock_enable_timestamp(sk, SOCK_TIMESTAMP); - } else { - sock_reset_flag(sk, SOCK_RCVTSTAMP); - sock_reset_flag(sk, SOCK_RCVTSTAMPNS); - sock_reset_flag(sk, SOCK_TSTAMP_NEW); - } + __sock_set_timestamps(sk, valbool, true, true); break; - case SO_TIMESTAMPING_NEW: sock_set_flag(sk, SOCK_TSTAMP_NEW); /* fall through */ diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 01135e54d95d..5ea2bd01fdd5 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -189,13 +189,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) } /* We want receive timestamps. */ - opt = 1; - ret = kernel_setsockopt(local->socket, SOL_SOCKET, SO_TIMESTAMPNS_OLD, - (char *)&opt, sizeof(opt)); - if (ret < 0) { - _debug("setsockopt failed"); - goto error; - } + sock_enable_timestamps(local->socket->sk); break; default: -- cgit v1.2.3 From ce3d9544cecacd40389c399d2b7ca31acc533b70 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:15 +0200 Subject: net: add sock_set_keepalive Add a helper to directly set the SO_KEEPALIVE sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- fs/dlm/lowcomms.c | 6 +----- include/net/sock.h | 1 + net/core/sock.c | 10 ++++++++++ net/rds/tcp_listen.c | 6 +----- net/sunrpc/xprtsock.c | 4 +--- 5 files changed, 14 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index b79711d0aac7..b6e6dba28154 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1142,11 +1142,7 @@ static struct socket *tcp_create_listen_sock(struct connection *con, con->sock = NULL; goto create_out; } - result = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, - (char *)&one, sizeof(one)); - if (result < 0) { - log_print("Set keepalive failed: %d", result); - } + sock_set_keepalive(sock->sk); result = sock->ops->listen(sock, 5); if (result < 0) { diff --git a/include/net/sock.h b/include/net/sock.h index 99ef43508d2b..dc08c176238f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2691,6 +2691,7 @@ void sock_def_readable(struct sock *sk); int sock_bindtoindex(struct sock *sk, int ifindex); void sock_enable_timestamps(struct sock *sk); void sock_no_linger(struct sock *sk); +void sock_set_keepalive(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); void sock_set_reuseaddr(struct sock *sk); void sock_set_sndtimeo(struct sock *sk, s64 secs); diff --git a/net/core/sock.c b/net/core/sock.c index e4a4dd2b3d8b..728f5fb156a0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -779,6 +779,16 @@ void sock_enable_timestamps(struct sock *sk) } EXPORT_SYMBOL(sock_enable_timestamps); +void sock_set_keepalive(struct sock *sk) +{ + lock_sock(sk); + if (sk->sk_prot->keepalive) + sk->sk_prot->keepalive(sk, true); + sock_valbool_flag(sk, SOCK_KEEPOPEN, true); + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_keepalive); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. 
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index bbb31b9c0b39..d8bd13276959 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -43,13 +43,9 @@ int rds_tcp_keepalive(struct socket *sock) /* values below based on xs_udp_default_timeout */ int keepidle = 5; /* send a probe 'keepidle' secs after last data */ int keepcnt = 5; /* number of unack'ed probes before declaring dead */ - int keepalive = 1; int ret = 0; - ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, - (char *)&keepalive, sizeof(keepalive)); - if (ret < 0) - goto bail; + sock_set_keepalive(sock->sk); ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, (char *)&keepcnt, sizeof(keepcnt)); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 845d0be805ec..30082cd03996 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2110,7 +2110,6 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); unsigned int keepidle; unsigned int keepcnt; - unsigned int opt_on = 1; unsigned int timeo; spin_lock(&xprt->transport_lock); @@ -2122,8 +2121,7 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, spin_unlock(&xprt->transport_lock); /* TCP Keepalive options */ - kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, - (char *)&opt_on, sizeof(opt_on)); + sock_set_keepalive(sock->sk); kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, (char *)&keepidle, sizeof(keepidle)); kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, -- cgit v1.2.3 From 26cfabf9cdd273650126d84a48a7f8dedbcded48 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:16 +0200 Subject: net: add sock_set_rcvbuf Add a helper to directly set the SO_RCVBUFFORCE sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- fs/dlm/lowcomms.c | 7 +------ include/net/sock.h | 1 + net/core/sock.c | 59 +++++++++++++++++++++++++++++------------------------- 3 files changed, 34 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index b6e6dba28154..2822a430a2b4 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1180,7 +1180,6 @@ static int sctp_listen_for_all(void) struct socket *sock = NULL; int result = -EINVAL; struct connection *con = nodeid2con(0, GFP_NOFS); - int bufsize = NEEDED_RMEM; int one = 1; if (!con) @@ -1195,11 +1194,7 @@ static int sctp_listen_for_all(void) goto out; } - result = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUFFORCE, - (char *)&bufsize, sizeof(bufsize)); - if (result) - log_print("Error increasing buffer space on socket %d", result); - + sock_set_rcvbuf(sock->sk, NEEDED_RMEM); result = kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one, sizeof(one)); if (result < 0) diff --git a/include/net/sock.h b/include/net/sock.h index dc08c176238f..c997289aabbf 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2693,6 +2693,7 @@ void sock_enable_timestamps(struct sock *sk); void sock_no_linger(struct sock *sk); void sock_set_keepalive(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); +void sock_set_rcvbuf(struct sock *sk, int val); void sock_set_reuseaddr(struct sock *sk); void sock_set_sndtimeo(struct sock *sk, s64 secs); diff --git a/net/core/sock.c b/net/core/sock.c index 728f5fb156a0..3c6ebf952e9a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -789,6 +789,35 @@ void sock_set_keepalive(struct sock *sk) } EXPORT_SYMBOL(sock_set_keepalive); +static void __sock_set_rcvbuf(struct sock *sk, int val) +{ + /* Ensure val * 2 fits into an int, to prevent max_t() from treating it + * as a negative value. + */ + val = min_t(int, val, INT_MAX / 2); + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + + /* We double it on the way in to account for "struct sk_buff" etc. + * overhead. Applications assume that the SO_RCVBUF setting they make + * will allow that much actual data to be received on that socket. + * + * Applications are unaware that "struct sk_buff" and other overheads + * allocate from the receive buffer during socket buffer allocation. + * + * And after considering the possible alternatives, returning the value + * we actually used in getsockopt is the most desirable behavior. + */ + WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); +} + +void sock_set_rcvbuf(struct sock *sk, int val) +{ + lock_sock(sk); + __sock_set_rcvbuf(sk, val); + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_rcvbuf); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. @@ -885,30 +914,7 @@ set_sndbuf: * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ - val = min_t(u32, val, sysctl_rmem_max); -set_rcvbuf: - /* Ensure val * 2 fits into an int, to prevent max_t() - * from treating it as a negative value. - */ - val = min_t(int, val, INT_MAX / 2); - sk->sk_userlocks |= SOCK_RCVBUF_LOCK; - /* - * We double it on the way in to account for - * "struct sk_buff" etc. overhead. Applications - * assume that the SO_RCVBUF setting they make will - * allow that much actual data to be received on that - * socket. - * - * Applications are unaware that "struct sk_buff" and - * other overheads allocate from the receive buffer - * during socket buffer allocation. 
- * - * And after considering the possible alternatives, - * returning the value we actually used in getsockopt - * is the most desirable behavior. - */ - WRITE_ONCE(sk->sk_rcvbuf, - max_t(int, val * 2, SOCK_MIN_RCVBUF)); + __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max)); break; case SO_RCVBUFFORCE: @@ -920,9 +926,8 @@ set_rcvbuf: /* No negative values (to prevent underflow, as val will be * multiplied by 2). */ - if (val < 0) - val = 0; - goto set_rcvbuf; + __sock_set_rcvbuf(sk, max(val, 0)); + break; case SO_KEEPALIVE: if (sk->sk_prot->keepalive) -- cgit v1.2.3 From fe31a326a4aadb4a3ba2b21deacc380d06802737 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:17 +0200 Subject: net: add sock_set_reuseport Add a helper to directly set the SO_REUSEPORT sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/net/sock.h | 1 + net/core/sock.c | 8 ++++++++ net/sunrpc/xprtsock.c | 17 +---------------- 3 files changed, 10 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index c997289aabbf..d994daa418ec 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2695,6 +2695,7 @@ void sock_set_keepalive(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); void sock_set_rcvbuf(struct sock *sk, int val); void sock_set_reuseaddr(struct sock *sk); +void sock_set_reuseport(struct sock *sk); void sock_set_sndtimeo(struct sock *sk, s64 secs); #endif /* _SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index 3c6ebf952e9a..2ca3425b519c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -729,6 +729,14 @@ void sock_set_reuseaddr(struct sock *sk) } EXPORT_SYMBOL(sock_set_reuseaddr); +void sock_set_reuseport(struct sock *sk) +{ + lock_sock(sk); + sk->sk_reuseport = true; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_reuseport); + void sock_no_linger(struct sock *sk) { lock_sock(sk); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 30082cd03996..399848c2bcb2 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1594,21 +1594,6 @@ static int xs_get_random_port(void) return rand + min; } -/** - * xs_set_reuseaddr_port - set the socket's port and address reuse options - * @sock: socket - * - * Note that this function has to be called on all sockets that share the - * same port, and it must be called before binding. - */ -static void xs_sock_set_reuseport(struct socket *sock) -{ - int opt = 1; - - kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, - (char *)&opt, sizeof(opt)); -} - static unsigned short xs_sock_getport(struct socket *sock) { struct sockaddr_storage buf; @@ -1801,7 +1786,7 @@ static struct socket *xs_create_sock(struct rpc_xprt *xprt, xs_reclassify_socket(family, sock); if (reuseport) - xs_sock_set_reuseport(sock); + sock_set_reuseport(sock->sk); err = xs_bind(transport, sock); if (err) { -- cgit v1.2.3 From db10538a4b997a77a1fd561adaaa58afc7dcfa2f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:18 +0200 Subject: tcp: add tcp_sock_set_cork Add a helper to directly set the TCP_CORK sockopt from kernel space without going through a fake uaccess. Cleanup the callers to avoid pointless wrappers now that this is a simple function call. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- drivers/block/drbd/drbd_int.h | 14 ----------- drivers/block/drbd/drbd_receiver.c | 4 +-- drivers/block/drbd/drbd_worker.c | 6 ++--- fs/cifs/transport.c | 8 ++---- include/linux/tcp.h | 2 ++ net/ipv4/tcp.c | 51 ++++++++++++++++++++++++-------------- net/rds/tcp_send.c | 9 ++----- 7 files changed, 43 insertions(+), 51 deletions(-) (limited to 'net') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index aae99a2d7bd4..3550adc93c68 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1570,20 +1570,6 @@ extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled); extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed); extern int drbd_connected(struct drbd_peer_device *); -static inline void drbd_tcp_cork(struct socket *sock) -{ - int val = 1; - (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK, - (char*)&val, sizeof(val)); -} - -static inline void drbd_tcp_uncork(struct socket *sock) -{ - int val = 0; - (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK, - (char*)&val, sizeof(val)); -} - static inline void drbd_tcp_nodelay(struct socket *sock) { int val = 1; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index c15e7083b13a..55ea907ad33c 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -6162,7 +6162,7 @@ void drbd_send_acks_wf(struct work_struct *ws) rcu_read_unlock(); if (tcp_cork) - drbd_tcp_cork(connection->meta.socket); + tcp_sock_set_cork(connection->meta.socket->sk, true); err = drbd_finish_peer_reqs(device); kref_put(&device->kref, drbd_destroy_device); @@ -6175,7 +6175,7 @@ void drbd_send_acks_wf(struct work_struct *ws) } if (tcp_cork) - drbd_tcp_uncork(connection->meta.socket); + tcp_sock_set_cork(connection->meta.socket->sk, false); return; } diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 0dc019da1f8d..2b89c9f2ca70 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -2098,7 +2098,7 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head * if (uncork) { mutex_lock(&connection->data.mutex); if (connection->data.socket) - drbd_tcp_uncork(connection->data.socket); + tcp_sock_set_cork(connection->data.socket->sk, false); mutex_unlock(&connection->data.mutex); } @@ -2153,9 +2153,9 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head * mutex_lock(&connection->data.mutex); if (connection->data.socket) { if (cork) - drbd_tcp_cork(connection->data.socket); + tcp_sock_set_cork(connection->data.socket->sk, true); else if (!uncork) - drbd_tcp_uncork(connection->data.socket); + tcp_sock_set_cork(connection->data.socket->sk, false); } mutex_unlock(&connection->data.mutex); } diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index c97570eb2c18..99760063e000 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -325,7 +325,6 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, size_t total_len = 0, sent, size; struct socket *ssocket = server->ssocket; struct msghdr smb_msg; - int val = 1; __be32 rfc1002_marker; if (cifs_rdma_enabled(server)) { @@ -345,8 +344,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, } /* cork the socket */ - kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK, - (char *)&val, sizeof(val)); + tcp_sock_set_cork(ssocket->sk, true); for (j = 0; j < num_rqst; j++) send_length += smb_rqst_len(server, &rqst[j]); @@ -435,9 +433,7 
@@ unmask: } /* uncork it */ - val = 0; - kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK, - (char *)&val, sizeof(val)); + tcp_sock_set_cork(ssocket->sk, false); if ((total_len > 0) && (total_len != send_length)) { cifs_dbg(FYI, "partial send (wanted=%u sent=%zu): terminating session\n", diff --git a/include/linux/tcp.h b/include/linux/tcp.h index bf44e85d709d..889eeb2256c2 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -497,4 +497,6 @@ static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss) int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen); +void tcp_sock_set_cork(struct sock *sk, bool on); + #endif /* _LINUX_TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 970064996377..e6cf702e16d6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2801,6 +2801,37 @@ static void tcp_enable_tx_delay(void) } } +/* When set indicates to always queue non-full frames. Later the user clears + * this option and we transmit any pending partial frames in the queue. This is + * meant to be used alongside sendfile() to get properly filled frames when the + * user (for example) must write out headers with a write() call first and then + * use sendfile to send out the data parts. + * + * TCP_CORK can be set together with TCP_NODELAY and it is stronger than + * TCP_NODELAY. + */ +static void __tcp_sock_set_cork(struct sock *sk, bool on) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (on) { + tp->nonagle |= TCP_NAGLE_CORK; + } else { + tp->nonagle &= ~TCP_NAGLE_CORK; + if (tp->nonagle & TCP_NAGLE_OFF) + tp->nonagle |= TCP_NAGLE_PUSH; + tcp_push_pending_frames(sk); + } +} + +void tcp_sock_set_cork(struct sock *sk, bool on) +{ + lock_sock(sk); + __tcp_sock_set_cork(sk, on); + release_sock(sk); +} +EXPORT_SYMBOL(tcp_sock_set_cork); + /* * Socket option code for TCP. */ @@ -2979,25 +3010,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_CORK: - /* When set indicates to always queue non-full frames. - * Later the user clears this option and we transmit - * any pending partial frames in the queue. This is - * meant to be used alongside sendfile() to get properly - * filled frames when the user (for example) must write - * out headers with a write() call first and then use - * sendfile to send out the data parts. - * - * TCP_CORK can be set together with TCP_NODELAY and it is - * stronger than TCP_NODELAY. 
- */ - if (val) { - tp->nonagle |= TCP_NAGLE_CORK; - } else { - tp->nonagle &= ~TCP_NAGLE_CORK; - if (tp->nonagle&TCP_NAGLE_OFF) - tp->nonagle |= TCP_NAGLE_PUSH; - tcp_push_pending_frames(sk); - } + __tcp_sock_set_cork(sk, val); break; case TCP_KEEPIDLE: diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 78a2554a4497..8c4d1d6e9249 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -38,23 +38,18 @@ #include "rds.h" #include "tcp.h" -static void rds_tcp_cork(struct socket *sock, int val) -{ - kernel_setsockopt(sock, SOL_TCP, TCP_CORK, (void *)&val, sizeof(val)); -} - void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp) { struct rds_tcp_connection *tc = cp->cp_transport_data; - rds_tcp_cork(tc->t_sock, 1); + tcp_sock_set_cork(tc->t_sock->sk, true); } void rds_tcp_xmit_path_complete(struct rds_conn_path *cp) { struct rds_tcp_connection *tc = cp->cp_transport_data; - rds_tcp_cork(tc->t_sock, 0); + tcp_sock_set_cork(tc->t_sock->sk, false); } /* the core send_sem serializes this with other xmit and shutdown */ -- cgit v1.2.3 From 12abc5ee7873a085cc280240822b8ac53c86fecd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:19 +0200 Subject: tcp: add tcp_sock_set_nodelay Add a helper to directly set the TCP_NODELAY sockopt from kernel space without going through a fake uaccess. Cleanup the callers to avoid pointless wrappers now that this is a simple function call. Signed-off-by: Christoph Hellwig Acked-by: Sagi Grimberg Acked-by: Jason Gunthorpe Signed-off-by: David S. Miller --- drivers/block/drbd/drbd_int.h | 7 ------ drivers/block/drbd/drbd_main.c | 2 +- drivers/block/drbd/drbd_receiver.c | 4 ++-- drivers/infiniband/sw/siw/siw_cm.c | 24 ++++--------------- drivers/nvme/host/tcp.c | 9 +------ drivers/nvme/target/tcp.c | 12 ++-------- drivers/target/iscsi/iscsi_target_login.c | 15 +++--------- fs/cifs/connect.c | 10 ++------ fs/dlm/lowcomms.c | 8 ++----- fs/ocfs2/cluster/tcp.c | 20 ++-------------- include/linux/tcp.h | 1 + net/ceph/messenger.c | 11 ++------- net/ipv4/tcp.c | 39 ++++++++++++++++++++----------- net/rds/tcp.c | 11 +-------- net/rds/tcp.h | 1 - net/rds/tcp_listen.c | 2 +- 16 files changed, 49 insertions(+), 127 deletions(-) (limited to 'net') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 3550adc93c68..e24bba87c8e0 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1570,13 +1570,6 @@ extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled); extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed); extern int drbd_connected(struct drbd_peer_device *); -static inline void drbd_tcp_nodelay(struct socket *sock) -{ - int val = 1; - (void) kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, - (char*)&val, sizeof(val)); -} - static inline void drbd_tcp_quickack(struct socket *sock) { int val = 2; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index c094c3c2c5d4..45fbd526c453 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -660,7 +660,7 @@ static int __send_command(struct drbd_connection *connection, int vnr, /* DRBD protocol "pings" are latency critical. 
* This is supposed to trigger tcp_push_pending_frames() */ if (!err && (cmd == P_PING || cmd == P_PING_ACK)) - drbd_tcp_nodelay(sock->socket); + tcp_sock_set_nodelay(sock->socket->sk); return err; } diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 55ea907ad33c..20a5e94494ac 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1051,8 +1051,8 @@ randomize: /* we don't want delays. * we use TCP_CORK where appropriate, though */ - drbd_tcp_nodelay(sock.socket); - drbd_tcp_nodelay(msock.socket); + tcp_sock_set_nodelay(sock.socket->sk); + tcp_sock_set_nodelay(msock.socket->sk); connection->data.socket = sock.socket; connection->meta.socket = msock.socket; diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index d1860f3e8740..1662216be66d 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -947,16 +947,8 @@ static void siw_accept_newconn(struct siw_cep *cep) siw_cep_get(new_cep); new_s->sk->sk_user_data = new_cep; - if (siw_tcp_nagle == false) { - int val = 1; - - rv = kernel_setsockopt(new_s, SOL_TCP, TCP_NODELAY, - (char *)&val, sizeof(val)); - if (rv) { - siw_dbg_cep(cep, "setsockopt NODELAY error: %d\n", rv); - goto error; - } - } + if (siw_tcp_nagle == false) + tcp_sock_set_nodelay(new_s->sk); new_cep->state = SIW_EPSTATE_AWAIT_MPAREQ; rv = siw_cm_queue_work(new_cep, SIW_CM_WORK_MPATIMEOUT); @@ -1386,16 +1378,8 @@ int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params) siw_dbg_qp(qp, "kernel_bindconnect: error %d\n", rv); goto error; } - if (siw_tcp_nagle == false) { - int val = 1; - - rv = kernel_setsockopt(s, SOL_TCP, TCP_NODELAY, (char *)&val, - sizeof(val)); - if (rv) { - siw_dbg_qp(qp, "setsockopt NODELAY error: %d\n", rv); - goto error; - } - } + if (siw_tcp_nagle == false) + tcp_sock_set_nodelay(s->sk); cep = siw_cep_alloc(sdev); if (!cep) { rv = -ENOMEM; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index a307972d33a0..4e4a750ecdb9 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1346,14 +1346,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, } /* Set TCP no delay */ - opt = 1; - ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, - TCP_NODELAY, (char *)&opt, sizeof(opt)); - if (ret) { - dev_err(nctrl->device, - "failed to set TCP_NODELAY sock opt %d\n", ret); - goto err_sock; - } + tcp_sock_set_nodelay(queue->sock->sk); /* * Cleanup whatever is sitting in the TCP transmit queue on socket diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index f3088156d01d..55bc4c3c0a74 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1580,7 +1580,7 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) { struct nvmet_tcp_port *port; __kernel_sa_family_t af; - int opt, ret; + int ret; port = kzalloc(sizeof(*port), GFP_KERNEL); if (!port) @@ -1625,15 +1625,7 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) port->data_ready = port->sock->sk->sk_data_ready; port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready; sock_set_reuseaddr(port->sock->sk); - - opt = 1; - ret = kernel_setsockopt(port->sock, IPPROTO_TCP, - TCP_NODELAY, (char *)&opt, sizeof(opt)); - if (ret) { - pr_err("failed to set TCP_NODELAY sock opt %d\n", ret); - goto err_sock; - } - + tcp_sock_set_nodelay(port->sock->sk); if (so_priority > 0) sock_set_priority(port->sock->sk, so_priority); diff --git a/drivers/target/iscsi/iscsi_target_login.c 
b/drivers/target/iscsi/iscsi_target_login.c index 91acb3f07b4c..b561b07a869a 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -897,20 +897,11 @@ int iscsit_setup_np( /* * Set SO_REUSEADDR, and disable Nagel Algorithm with TCP_NODELAY. */ - /* FIXME: Someone please explain why this is endian-safe */ - opt = 1; - if (np->np_network_transport == ISCSI_TCP) { - ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, - (char *)&opt, sizeof(opt)); - if (ret < 0) { - pr_err("kernel_setsockopt() for TCP_NODELAY" - " failed: %d\n", ret); - goto fail; - } - } - + if (np->np_network_transport == ISCSI_TCP) + tcp_sock_set_nodelay(sock->sk); sock_set_reuseaddr(sock->sk); + opt = 1; ret = kernel_setsockopt(sock, IPPROTO_IP, IP_FREEBIND, (char *)&opt, sizeof(opt)); if (ret < 0) { diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 28268ed461b8..ad8fb53b3682 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3929,14 +3929,8 @@ generic_ip_connect(struct TCP_Server_Info *server) socket->sk->sk_rcvbuf = 140 * 1024; } - if (server->tcp_nodelay) { - int val = 1; - rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, - (char *)&val, sizeof(val)); - if (rc) - cifs_dbg(FYI, "set TCP_NODELAY socket option error %d\n", - rc); - } + if (server->tcp_nodelay) + tcp_sock_set_nodelay(socket->sk); cifs_dbg(FYI, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx\n", socket->sk->sk_sndbuf, diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 2822a430a2b4..69333728d871 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1011,7 +1011,6 @@ static void tcp_connect_to_sock(struct connection *con) struct sockaddr_storage saddr, src_addr; int addr_len; struct socket *sock = NULL; - int one = 1; int result; if (con->nodeid == 0) { @@ -1060,8 +1059,7 @@ static void tcp_connect_to_sock(struct connection *con) log_print("connecting to %d", con->nodeid); /* Turn off Nagle's algorithm */ - kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, - sizeof(one)); + tcp_sock_set_nodelay(sock->sk); result = sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len, O_NONBLOCK); @@ -1103,7 +1101,6 @@ static struct socket *tcp_create_listen_sock(struct connection *con, { struct socket *sock = NULL; int result = 0; - int one = 1; int addr_len; if (dlm_local_addr[0]->ss_family == AF_INET) @@ -1120,8 +1117,7 @@ static struct socket *tcp_create_listen_sock(struct connection *con, } /* Turn off Nagle's algorithm */ - kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, - sizeof(one)); + tcp_sock_set_nodelay(sock->sk); sock_set_reuseaddr(sock->sk); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 2c512b40a940..4c70fe9d19ab 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1441,14 +1441,6 @@ static void o2net_rx_until_empty(struct work_struct *work) sc_put(sc); } -static int o2net_set_nodelay(struct socket *sock) -{ - int val = 1; - - return kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, - (void *)&val, sizeof(val)); -} - static int o2net_set_usertimeout(struct socket *sock) { int user_timeout = O2NET_TCP_USER_TIMEOUT; @@ -1636,11 +1628,7 @@ static void o2net_start_connect(struct work_struct *work) goto out; } - ret = o2net_set_nodelay(sc->sc_sock); - if (ret) { - mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); - goto out; - } + tcp_sock_set_nodelay(sc->sc_sock->sk); ret = o2net_set_usertimeout(sock); if (ret) { @@ -1832,11 +1820,7 @@ static int o2net_accept_one(struct socket *sock, int *more) *more = 1; 
new_sock->sk->sk_allocation = GFP_ATOMIC; - ret = o2net_set_nodelay(new_sock); - if (ret) { - mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); - goto out; - } + tcp_sock_set_nodelay(new_sock->sk); ret = o2net_set_usertimeout(new_sock); if (ret) { diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 889eeb2256c2..9e42c7fe50a8 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -498,5 +498,6 @@ int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen); void tcp_sock_set_cork(struct sock *sk, bool on); +void tcp_sock_set_nodelay(struct sock *sk); #endif /* _LINUX_TCP_H */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index f8ca5edc5f2c..27d6ab11f9ee 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -490,15 +490,8 @@ static int ceph_tcp_connect(struct ceph_connection *con) return ret; } - if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY)) { - int optval = 1; - - ret = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, - (char *)&optval, sizeof(optval)); - if (ret) - pr_err("kernel_setsockopt(TCP_NODELAY) failed: %d", - ret); - } + if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY)) + tcp_sock_set_nodelay(sock->sk); con->sock = sock; return 0; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e6cf702e16d6..a65f293a19fa 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2832,6 +2832,30 @@ void tcp_sock_set_cork(struct sock *sk, bool on) } EXPORT_SYMBOL(tcp_sock_set_cork); +/* TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket is + * remembered, but it is not activated until cork is cleared. + * + * However, when TCP_NODELAY is set we make an explicit push, which overrides + * even TCP_CORK for currently queued segments. + */ +static void __tcp_sock_set_nodelay(struct sock *sk, bool on) +{ + if (on) { + tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; + tcp_push_pending_frames(sk); + } else { + tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF; + } +} + +void tcp_sock_set_nodelay(struct sock *sk) +{ + lock_sock(sk); + __tcp_sock_set_nodelay(sk, true); + release_sock(sk); +} +EXPORT_SYMBOL(tcp_sock_set_nodelay); + /* * Socket option code for TCP. */ @@ -2929,20 +2953,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_NODELAY: - if (val) { - /* TCP_NODELAY is weaker than TCP_CORK, so that - * this option on corked socket is remembered, but - * it is not activated until cork is cleared. - * - * However, when TCP_NODELAY is set we make - * an explicit push, which overrides even TCP_CORK - * for currently queued segments. 
- */ - tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; - tcp_push_pending_frames(sk); - } else { - tp->nonagle &= ~TCP_NAGLE_OFF; - } + __tcp_sock_set_nodelay(sk, val); break; case TCP_THIN_LINEAR_TIMEOUTS: diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 46782fac4c16..43db0eca911f 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -89,15 +89,6 @@ static struct ctl_table rds_tcp_sysctl_table[] = { { } }; -/* doing it this way avoids calling tcp_sk() */ -void rds_tcp_nonagle(struct socket *sock) -{ - int val = 1; - - kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (void *)&val, - sizeof(val)); -} - u32 rds_tcp_write_seq(struct rds_tcp_connection *tc) { /* seq# of the last byte of data in tcp send buffer */ @@ -502,7 +493,7 @@ void rds_tcp_tune(struct socket *sock) struct net *net = sock_net(sk); struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); - rds_tcp_nonagle(sock); + tcp_sock_set_nodelay(sock->sk); lock_sock(sk); if (rtn->sndbuf_size > 0) { sk->sk_sndbuf = rtn->sndbuf_size; diff --git a/net/rds/tcp.h b/net/rds/tcp.h index d640e210b97b..f6d75d8cb167 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -50,7 +50,6 @@ struct rds_tcp_statistics { /* tcp.c */ void rds_tcp_tune(struct socket *sock); -void rds_tcp_nonagle(struct socket *sock); void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp); void rds_tcp_reset_callbacks(struct socket *sock, struct rds_conn_path *cp); void rds_tcp_restore_callbacks(struct socket *sock, diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index d8bd13276959..6f90ea077adc 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -288,7 +288,7 @@ struct socket *rds_tcp_listen_init(struct net *net, bool isv6) } sock->sk->sk_reuse = SK_CAN_REUSE; - rds_tcp_nonagle(sock); + tcp_sock_set_nodelay(sock->sk); write_lock_bh(&sock->sk->sk_callback_lock); sock->sk->sk_user_data = sock->sk->sk_data_ready; -- cgit v1.2.3 From ddd061b8daed3ce0c01109a69c9a2a9f9669f01a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:20 +0200 Subject: tcp: add tcp_sock_set_quickack Add a helper to directly set the TCP_QUICKACK sockopt from kernel space without going through a fake uaccess. Cleanup the callers to avoid pointless wrappers now that this is a simple function call. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- drivers/block/drbd/drbd_int.h | 7 ------- drivers/block/drbd/drbd_receiver.c | 5 ++--- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 39 +++++++++++++++++++++++++------------- 4 files changed, 29 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index e24bba87c8e0..14345a87c7cc 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1570,13 +1570,6 @@ extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled); extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed); extern int drbd_connected(struct drbd_peer_device *); -static inline void drbd_tcp_quickack(struct socket *sock) -{ - int val = 2; - (void) kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, - (char*)&val, sizeof(val)); -} - /* sets the number of 512 byte sectors of our virtual device */ void drbd_set_my_capacity(struct drbd_device *device, sector_t size); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 20a5e94494ac..3a3f2b6a821f 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1223,7 +1223,7 @@ static int drbd_recv_header_maybe_unplug(struct drbd_connection *connection, str * quickly as possible, and let remote TCP know what we have * received so far. */ if (err == -EAGAIN) { - drbd_tcp_quickack(connection->data.socket); + tcp_sock_set_quickack(connection->data.socket->sk, 2); drbd_unplug_all_devices(connection); } if (err > 0) { @@ -4959,8 +4959,7 @@ static int receive_UnplugRemote(struct drbd_connection *connection, struct packe { /* Make sure we've acked all the TCP data associated * with the data requests being unplugged */ - drbd_tcp_quickack(connection->data.socket); - + tcp_sock_set_quickack(connection->data.socket->sk, 2); return 0; } diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 9e42c7fe50a8..2eaf8320b9db 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -499,5 +499,6 @@ int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, void tcp_sock_set_cork(struct sock *sk, bool on); void tcp_sock_set_nodelay(struct sock *sk); +void tcp_sock_set_quickack(struct sock *sk, int val); #endif /* _LINUX_TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a65f293a19fa..27b5e7a4e2ef 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2856,6 +2856,31 @@ void tcp_sock_set_nodelay(struct sock *sk) } EXPORT_SYMBOL(tcp_sock_set_nodelay); +static void __tcp_sock_set_quickack(struct sock *sk, int val) +{ + if (!val) { + inet_csk_enter_pingpong_mode(sk); + return; + } + + inet_csk_exit_pingpong_mode(sk); + if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && + inet_csk_ack_scheduled(sk)) { + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED; + tcp_cleanup_rbuf(sk, 1); + if (!(val & 1)) + inet_csk_enter_pingpong_mode(sk); + } +} + +void tcp_sock_set_quickack(struct sock *sk, int val) +{ + lock_sock(sk); + __tcp_sock_set_quickack(sk, val); + release_sock(sk); +} +EXPORT_SYMBOL(tcp_sock_set_quickack); + /* * Socket option code for TCP. 
*/ @@ -3096,19 +3121,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_QUICKACK: - if (!val) { - inet_csk_enter_pingpong_mode(sk); - } else { - inet_csk_exit_pingpong_mode(sk); - if ((1 << sk->sk_state) & - (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && - inet_csk_ack_scheduled(sk)) { - icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; - tcp_cleanup_rbuf(sk, 1); - if (!(val & 1)) - inet_csk_enter_pingpong_mode(sk); - } - } + __tcp_sock_set_quickack(sk, val); break; #ifdef CONFIG_TCP_MD5SIG -- cgit v1.2.3 From 557eadfcc5ee8f8fa98a795e05ed21db58a65db5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:21 +0200 Subject: tcp: add tcp_sock_set_syncnt Add a helper to directly set the TCP_SYNCNT sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Acked-by: Sagi Grimberg Signed-off-by: David S. Miller --- drivers/nvme/host/tcp.c | 9 +-------- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 12 ++++++++++++ 3 files changed, 14 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 4e4a750ecdb9..2872584f52f6 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1336,14 +1336,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, } /* Single syn retry */ - opt = 1; - ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT, - (char *)&opt, sizeof(opt)); - if (ret) { - dev_err(nctrl->device, - "failed to set TCP_SYNCNT sock opt %d\n", ret); - goto err_sock; - } + tcp_sock_set_syncnt(queue->sock->sk, 1); /* Set TCP no delay */ tcp_sock_set_nodelay(queue->sock->sk); diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 2eaf8320b9db..6aa4ae5ebf3d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -500,5 +500,6 @@ int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, void tcp_sock_set_cork(struct sock *sk, bool on); void tcp_sock_set_nodelay(struct sock *sk); void tcp_sock_set_quickack(struct sock *sk, int val); +int tcp_sock_set_syncnt(struct sock *sk, int val); #endif /* _LINUX_TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 27b5e7a4e2ef..d2c67ae1da07 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2881,6 +2881,18 @@ void tcp_sock_set_quickack(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_sock_set_quickack); +int tcp_sock_set_syncnt(struct sock *sk, int val) +{ + if (val < 1 || val > MAX_TCP_SYNCNT) + return -EINVAL; + + lock_sock(sk); + inet_csk(sk)->icsk_syn_retries = val; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(tcp_sock_set_syncnt); + /* * Socket option code for TCP. */ -- cgit v1.2.3 From c488aeadcbd002a992593e6090d54e8ac27c4310 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:22 +0200 Subject: tcp: add tcp_sock_set_user_timeout Add a helper to directly set the TCP_USER_TIMEOUT sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- fs/ocfs2/cluster/tcp.c | 22 ++-------------------- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 8 ++++++++ net/sunrpc/xprtsock.c | 3 +-- 4 files changed, 12 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 4c70fe9d19ab..79a231719460 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1441,14 +1441,6 @@ static void o2net_rx_until_empty(struct work_struct *work) sc_put(sc); } -static int o2net_set_usertimeout(struct socket *sock) -{ - int user_timeout = O2NET_TCP_USER_TIMEOUT; - - return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, - (void *)&user_timeout, sizeof(user_timeout)); -} - static void o2net_initialize_handshake(void) { o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( @@ -1629,12 +1621,7 @@ static void o2net_start_connect(struct work_struct *work) } tcp_sock_set_nodelay(sc->sc_sock->sk); - - ret = o2net_set_usertimeout(sock); - if (ret) { - mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret); - goto out; - } + tcp_sock_set_user_timeout(sock->sk, O2NET_TCP_USER_TIMEOUT); o2net_register_callbacks(sc->sc_sock->sk, sc); @@ -1821,12 +1808,7 @@ static int o2net_accept_one(struct socket *sock, int *more) new_sock->sk->sk_allocation = GFP_ATOMIC; tcp_sock_set_nodelay(new_sock->sk); - - ret = o2net_set_usertimeout(new_sock); - if (ret) { - mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret); - goto out; - } + tcp_sock_set_user_timeout(new_sock->sk, O2NET_TCP_USER_TIMEOUT); ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, 1); if (ret < 0) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 6aa4ae5ebf3d..de682143efe4 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -501,5 +501,6 @@ void tcp_sock_set_cork(struct sock *sk, bool on); void tcp_sock_set_nodelay(struct sock *sk); void tcp_sock_set_quickack(struct sock *sk, int val); int tcp_sock_set_syncnt(struct sock *sk, int val); +void tcp_sock_set_user_timeout(struct sock *sk, u32 val); #endif /* _LINUX_TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d2c67ae1da07..0004bd9ae7b0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2893,6 +2893,14 @@ int tcp_sock_set_syncnt(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_sock_set_syncnt); +void tcp_sock_set_user_timeout(struct sock *sk, u32 val) +{ + lock_sock(sk); + inet_csk(sk)->icsk_user_timeout = val; + release_sock(sk); +} +EXPORT_SYMBOL(tcp_sock_set_user_timeout); + /* * Socket option code for TCP. */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 399848c2bcb2..231fd6162f68 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2115,8 +2115,7 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, (char *)&keepcnt, sizeof(keepcnt)); /* TCP user timeout (see RFC5482) */ - kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, - (char *)&timeo, sizeof(timeo)); + tcp_sock_set_user_timeout(sock->sk, timeo); } static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt, -- cgit v1.2.3 From 71c48eb81c9ecb6fed49dc33e7c9b621fdcb7bf8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:23 +0200 Subject: tcp: add tcp_sock_set_keepidle Add a helper to directly set the TCP_KEEPIDLE sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S.
Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 49 ++++++++++++++++++++++++++++++++++--------------- net/rds/tcp_listen.c | 5 +---- net/sunrpc/xprtsock.c | 3 +-- 4 files changed, 37 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index de682143efe4..5724dd84a85e 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -498,6 +498,7 @@ int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen); void tcp_sock_set_cork(struct sock *sk, bool on); +int tcp_sock_set_keepidle(struct sock *sk, int val); void tcp_sock_set_nodelay(struct sock *sk); void tcp_sock_set_quickack(struct sock *sk, int val); int tcp_sock_set_syncnt(struct sock *sk, int val); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0004bd9ae7b0..bdf0ff933351 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2901,6 +2901,39 @@ void tcp_sock_set_user_timeout(struct sock *sk, u32 val) } EXPORT_SYMBOL(tcp_sock_set_user_timeout); +static int __tcp_sock_set_keepidle(struct sock *sk, int val) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (val < 1 || val > MAX_TCP_KEEPIDLE) + return -EINVAL; + + tp->keepalive_time = val * HZ; + if (sock_flag(sk, SOCK_KEEPOPEN) && + !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { + u32 elapsed = keepalive_time_elapsed(tp); + + if (tp->keepalive_time > elapsed) + elapsed = tp->keepalive_time - elapsed; + else + elapsed = 0; + inet_csk_reset_keepalive_timer(sk, elapsed); + } + + return 0; +} + +int tcp_sock_set_keepidle(struct sock *sk, int val) +{ + int err; + + lock_sock(sk); + err = __tcp_sock_set_keepidle(sk, val); + release_sock(sk); + return err; +} +EXPORT_SYMBOL(tcp_sock_set_keepidle); + /* * Socket option code for TCP. */ @@ -3070,21 +3103,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_KEEPIDLE: - if (val < 1 || val > MAX_TCP_KEEPIDLE) - err = -EINVAL; - else { - tp->keepalive_time = val * HZ; - if (sock_flag(sk, SOCK_KEEPOPEN) && - !((1 << sk->sk_state) & - (TCPF_CLOSE | TCPF_LISTEN))) { - u32 elapsed = keepalive_time_elapsed(tp); - if (tp->keepalive_time > elapsed) - elapsed = tp->keepalive_time - elapsed; - else - elapsed = 0; - inet_csk_reset_keepalive_timer(sk, elapsed); - } - } + err = __tcp_sock_set_keepidle(sk, val); break; case TCP_KEEPINTVL: if (val < 1 || val > MAX_TCP_KEEPINTVL) diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 6f90ea077adc..79f9adc00811 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -52,10 +52,7 @@ int rds_tcp_keepalive(struct socket *sock) if (ret < 0) goto bail; - ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE, - (char *)&keepidle, sizeof(keepidle)); - if (ret < 0) - goto bail; + tcp_sock_set_keepidle(sock->sk, keepidle); /* KEEPINTVL is the interval between successive probes. We follow * the model in xs_tcp_finish_connecting() and re-use keepidle. 
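Unlike the void helpers, tcp_sock_set_keepidle() keeps the range check from do_tcp_setsockopt() and can fail. A minimal sketch of a checking caller, assuming an already-created TCP socket and an illustrative 60-second idle time:

#include <linux/net.h>
#include <linux/printk.h>
#include <linux/tcp.h>

/* Hypothetical caller: send the first keepalive probe after 60s of idle. */
static int example_set_keepidle(struct socket *sock)
{
	int err = tcp_sock_set_keepidle(sock->sk, 60);

	/* -EINVAL when the value is outside 1..MAX_TCP_KEEPIDLE. */
	if (err)
		pr_warn("keepidle rejected: %d\n", err);
	return err;
}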
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 231fd6162f68..473290f7c5c0 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2107,8 +2107,7 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, /* TCP Keepalive options */ sock_set_keepalive(sock->sk); - kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, - (char *)&keepidle, sizeof(keepidle)); + tcp_sock_set_keepidle(sock->sk, keepidle); kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, (char *)&keepidle, sizeof(keepidle)); kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, -- cgit v1.2.3 From d41ecaac903c9f4658a71d4e7a708673cfb5abba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:24 +0200 Subject: tcp: add tcp_sock_set_keepintvl Add a helper to directly set the TCP_KEEPINTVL sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 12 ++++++++++++ net/rds/tcp_listen.c | 4 +--- net/sunrpc/xprtsock.c | 3 +-- 4 files changed, 15 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 5724dd84a85e..1f9bada00faa 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -499,6 +499,7 @@ int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, void tcp_sock_set_cork(struct sock *sk, bool on); int tcp_sock_set_keepidle(struct sock *sk, int val); +int tcp_sock_set_keepintvl(struct sock *sk, int val); void tcp_sock_set_nodelay(struct sock *sk); void tcp_sock_set_quickack(struct sock *sk, int val); int tcp_sock_set_syncnt(struct sock *sk, int val); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bdf0ff933351..7eb083e09786 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2934,6 +2934,18 @@ int tcp_sock_set_keepidle(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_sock_set_keepidle); +int tcp_sock_set_keepintvl(struct sock *sk, int val) +{ + if (val < 1 || val > MAX_TCP_KEEPINTVL) + return -EINVAL; + + lock_sock(sk); + tcp_sk(sk)->keepalive_intvl = val * HZ; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(tcp_sock_set_keepintvl); + /* * Socket option code for TCP. */ diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 79f9adc00811..9ad555c48d15 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -53,12 +53,10 @@ int rds_tcp_keepalive(struct socket *sock) goto bail; tcp_sock_set_keepidle(sock->sk, keepidle); - /* KEEPINTVL is the interval between successive probes. We follow * the model in xs_tcp_finish_connecting() and re-use keepidle. 
*/ - ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL, - (char *)&keepidle, sizeof(keepidle)); + tcp_sock_set_keepintvl(sock->sk, keepidle); bail: return ret; } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 473290f7c5c0..5ca64e12af0c 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2108,8 +2108,7 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, /* TCP Keepalive options */ sock_set_keepalive(sock->sk); tcp_sock_set_keepidle(sock->sk, keepidle); - kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, - (char *)&keepidle, sizeof(keepidle)); + tcp_sock_set_keepintvl(sock->sk, keepidle); kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, (char *)&keepcnt, sizeof(keepcnt)); -- cgit v1.2.3 From 480aeb9639d6a077c611b303a22f9b1e5937d081 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:25 +0200 Subject: tcp: add tcp_sock_set_keepcnt Add a helper to directly set the TCP_KEEPCNT sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 12 ++++++++++++ net/rds/tcp.h | 2 +- net/rds/tcp_listen.c | 17 +++-------------- net/sunrpc/xprtsock.c | 3 +-- 5 files changed, 18 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1f9bada00faa..9aac824c523c 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -498,6 +498,7 @@ int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen); void tcp_sock_set_cork(struct sock *sk, bool on); +int tcp_sock_set_keepcnt(struct sock *sk, int val); int tcp_sock_set_keepidle(struct sock *sk, int val); int tcp_sock_set_keepintvl(struct sock *sk, int val); void tcp_sock_set_nodelay(struct sock *sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7eb083e09786..15d47d5e7951 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2946,6 +2946,18 @@ int tcp_sock_set_keepintvl(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_sock_set_keepintvl); +int tcp_sock_set_keepcnt(struct sock *sk, int val) +{ + if (val < 1 || val > MAX_TCP_KEEPCNT) + return -EINVAL; + + lock_sock(sk); + tcp_sk(sk)->keepalive_probes = val; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(tcp_sock_set_keepcnt); + /* * Socket option code for TCP. 
*/ diff --git a/net/rds/tcp.h b/net/rds/tcp.h index f6d75d8cb167..bad9cf49d565 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -70,7 +70,7 @@ struct socket *rds_tcp_listen_init(struct net *net, bool isv6); void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor); void rds_tcp_listen_data_ready(struct sock *sk); int rds_tcp_accept_one(struct socket *sock); -int rds_tcp_keepalive(struct socket *sock); +void rds_tcp_keepalive(struct socket *sock); void *rds_tcp_listen_sock_def_readable(struct net *net); /* tcp_recv.c */ diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 9ad555c48d15..101cf14215a0 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -38,27 +38,19 @@ #include "rds.h" #include "tcp.h" -int rds_tcp_keepalive(struct socket *sock) +void rds_tcp_keepalive(struct socket *sock) { /* values below based on xs_udp_default_timeout */ int keepidle = 5; /* send a probe 'keepidle' secs after last data */ int keepcnt = 5; /* number of unack'ed probes before declaring dead */ - int ret = 0; sock_set_keepalive(sock->sk); - - ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, - (char *)&keepcnt, sizeof(keepcnt)); - if (ret < 0) - goto bail; - + tcp_sock_set_keepcnt(sock->sk, keepcnt); tcp_sock_set_keepidle(sock->sk, keepidle); /* KEEPINTVL is the interval between successive probes. We follow * the model in xs_tcp_finish_connecting() and re-use keepidle. */ tcp_sock_set_keepintvl(sock->sk, keepidle); -bail: - return ret; } /* rds_tcp_accept_one_path(): if accepting on cp_index > 0, make sure the @@ -140,10 +132,7 @@ int rds_tcp_accept_one(struct socket *sock) new_sock->ops = sock->ops; __module_get(new_sock->ops->owner); - ret = rds_tcp_keepalive(new_sock); - if (ret < 0) - goto out; - + rds_tcp_keepalive(new_sock); rds_tcp_tune(new_sock); inet = inet_sk(new_sock->sk); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 5ca64e12af0c..0d3ec055bc12 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2109,8 +2109,7 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, sock_set_keepalive(sock->sk); tcp_sock_set_keepidle(sock->sk, keepidle); tcp_sock_set_keepintvl(sock->sk, keepidle); - kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, - (char *)&keepcnt, sizeof(keepcnt)); + tcp_sock_set_keepcnt(sock->sk, keepcnt); /* TCP user timeout (see RFC5482) */ tcp_sock_set_user_timeout(sock->sk, timeo); -- cgit v1.2.3 From 6ebf71bab9fb476fc8132be4c12b88201278f0ca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:26 +0200 Subject: ipv4: add ip_sock_set_tos Add a helper to directly set the IP_TOS sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Acked-by: Sagi Grimberg Signed-off-by: David S. 
Miller --- drivers/nvme/host/tcp.c | 14 +++----------- drivers/nvme/target/tcp.c | 10 ++-------- include/net/ip.h | 2 ++ net/ipv4/ip_sockglue.c | 30 +++++++++++++++++++++--------- 4 files changed, 28 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 2872584f52f6..4c972d8abf31 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1313,7 +1313,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue = &ctrl->queues[qid]; - int ret, opt, rcv_pdu_size; + int ret, rcv_pdu_size; queue->ctrl = ctrl; INIT_LIST_HEAD(&queue->send_list); @@ -1352,16 +1352,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, sock_set_priority(queue->sock->sk, so_priority); /* Set socket type of service */ - if (nctrl->opts->tos >= 0) { - opt = nctrl->opts->tos; - ret = kernel_setsockopt(queue->sock, SOL_IP, IP_TOS, - (char *)&opt, sizeof(opt)); - if (ret) { - dev_err(nctrl->device, - "failed to set IP_TOS sock opt %d\n", ret); - goto err_sock; - } - } + if (nctrl->opts->tos >= 0) + ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos); queue->sock->sk->sk_allocation = GFP_ATOMIC; nvme_tcp_set_queue_io_cpu(queue); diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 55bc4c3c0a74..4546049a96b3 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1452,14 +1452,8 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) sock_set_priority(sock->sk, so_priority); /* Set socket type of service */ - if (inet->rcv_tos > 0) { - int tos = inet->rcv_tos; - - ret = kernel_setsockopt(sock, SOL_IP, IP_TOS, - (char *)&tos, sizeof(tos)); - if (ret) - return ret; - } + if (inet->rcv_tos > 0) + ip_sock_set_tos(sock->sk, inet->rcv_tos); write_lock_bh(&sock->sk->sk_callback_lock); sock->sk->sk_user_data = queue; diff --git a/include/net/ip.h b/include/net/ip.h index 5b317c9f4470..2fc52e26fa88 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -765,4 +765,6 @@ static inline bool inetdev_valid_mtu(unsigned int mtu) return likely(mtu >= IPV4_MIN_MTU); } +void ip_sock_set_tos(struct sock *sk, int val); + #endif /* _IP_H */ diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index f43d5f12aa86..b43a29e11f4a 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -560,6 +560,26 @@ out: return err; } +static void __ip_sock_set_tos(struct sock *sk, int val) +{ + if (sk->sk_type == SOCK_STREAM) { + val &= ~INET_ECN_MASK; + val |= inet_sk(sk)->tos & INET_ECN_MASK; + } + if (inet_sk(sk)->tos != val) { + inet_sk(sk)->tos = val; + sk->sk_priority = rt_tos2priority(val); + sk_dst_reset(sk); + } +} + +void ip_sock_set_tos(struct sock *sk, int val) +{ + lock_sock(sk); + __ip_sock_set_tos(sk, val); + release_sock(sk); +} +EXPORT_SYMBOL(ip_sock_set_tos); /* * Socket option code for IP. 
This is the end of the line after any @@ -823,15 +843,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, inet->cmsg_flags &= ~IP_CMSG_RECVFRAGSIZE; break; case IP_TOS: /* This sets both TOS and Precedence */ - if (sk->sk_type == SOCK_STREAM) { - val &= ~INET_ECN_MASK; - val |= inet->tos & INET_ECN_MASK; - } - if (inet->tos != val) { - inet->tos = val; - sk->sk_priority = rt_tos2priority(val); - sk_dst_reset(sk); - } + __ip_sock_set_tos(sk, val); break; case IP_TTL: if (optlen < 1) -- cgit v1.2.3 From c4e446bf5a06a1db24b4f0115a89f0380a495c62 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:27 +0200 Subject: ipv4: add ip_sock_set_freebind Add a helper to directly set the IP_FREEBIND sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- drivers/target/iscsi/iscsi_target_login.c | 13 +++---------- include/net/ip.h | 1 + net/ipv4/ip_sockglue.c | 8 ++++++++ 3 files changed, 12 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c index b561b07a869a..85748e338858 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -15,6 +15,7 @@ #include #include #include /* TCP_NODELAY */ +#include #include /* ipv6_addr_v4mapped() */ #include #include @@ -855,7 +856,7 @@ int iscsit_setup_np( struct sockaddr_storage *sockaddr) { struct socket *sock = NULL; - int backlog = ISCSIT_TCP_BACKLOG, ret, opt = 0, len; + int backlog = ISCSIT_TCP_BACKLOG, ret, len; switch (np->np_network_transport) { case ISCSI_TCP: @@ -900,15 +901,7 @@ int iscsit_setup_np( if (np->np_network_transport == ISCSI_TCP) tcp_sock_set_nodelay(sock->sk); sock_set_reuseaddr(sock->sk); - - opt = 1; - ret = kernel_setsockopt(sock, IPPROTO_IP, IP_FREEBIND, - (char *)&opt, sizeof(opt)); - if (ret < 0) { - pr_err("kernel_setsockopt() for IP_FREEBIND" - " failed\n"); - goto fail; - } + ip_sock_set_freebind(sock->sk); ret = kernel_bind(sock, (struct sockaddr *)&np->np_sockaddr, len); if (ret < 0) { diff --git a/include/net/ip.h b/include/net/ip.h index 2fc52e26fa88..5f5d8226b6ab 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -765,6 +765,7 @@ static inline bool inetdev_valid_mtu(unsigned int mtu) return likely(mtu >= IPV4_MIN_MTU); } +void ip_sock_set_freebind(struct sock *sk); void ip_sock_set_tos(struct sock *sk, int val); #endif /* _IP_H */ diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index b43a29e11f4a..767838d2030d 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -581,6 +581,14 @@ void ip_sock_set_tos(struct sock *sk, int val) } EXPORT_SYMBOL(ip_sock_set_tos); +void ip_sock_set_freebind(struct sock *sk) +{ + lock_sock(sk); + inet_sk(sk)->freebind = true; + release_sock(sk); +} +EXPORT_SYMBOL(ip_sock_set_freebind); + /* * Socket option code for IP. This is the end of the line after any * TCP,UDP etc options on an IP socket. -- cgit v1.2.3 From db45c0ef258ef6c7ef3c1b8ea9e06e133e083c27 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:28 +0200 Subject: ipv4: add ip_sock_set_recverr Add a helper to directly set the IP_RECVERR sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Reviewed-by: David Howells Signed-off-by: David S. 
Miller --- include/net/ip.h | 1 + net/ipv4/ip_sockglue.c | 8 ++++++++ net/rxrpc/local_object.c | 8 +------- 3 files changed, 10 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index 5f5d8226b6ab..f063a491b906 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -766,6 +766,7 @@ static inline bool inetdev_valid_mtu(unsigned int mtu) } void ip_sock_set_freebind(struct sock *sk); +void ip_sock_set_recverr(struct sock *sk); void ip_sock_set_tos(struct sock *sk, int val); #endif /* _IP_H */ diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 767838d2030d..aca6b81da9ba 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -589,6 +589,14 @@ void ip_sock_set_freebind(struct sock *sk) } EXPORT_SYMBOL(ip_sock_set_freebind); +void ip_sock_set_recverr(struct sock *sk) +{ + lock_sock(sk); + inet_sk(sk)->recverr = true; + release_sock(sk); +} +EXPORT_SYMBOL(ip_sock_set_recverr); + /* * Socket option code for IP. This is the end of the line after any * TCP,UDP etc options on an IP socket. diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 5ea2bd01fdd5..4c0e8fe5ec1f 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -171,13 +171,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) /* Fall through */ case AF_INET: /* we want to receive ICMP errors */ - opt = 1; - ret = kernel_setsockopt(local->socket, SOL_IP, IP_RECVERR, - (char *) &opt, sizeof(opt)); - if (ret < 0) { - _debug("setsockopt failed"); - goto error; - } + ip_sock_set_recverr(local->socket->sk); /* we want to set the don't fragment bit */ opt = IP_PMTUDISC_DO; -- cgit v1.2.3 From 2de569bda2a66d1308ad3f205bb29cf4f95f5636 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:29 +0200 Subject: ipv4: add ip_sock_set_mtu_discover Add a helper to directly set the IP_MTU_DISCOVER sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Reviewed-by: David Howells [rxrpc bits] Signed-off-by: David S. Miller --- include/net/ip.h | 1 + net/ipv4/ip_sockglue.c | 11 +++++++++++ net/rxrpc/local_object.c | 8 +------- net/rxrpc/output.c | 14 +++++--------- 4 files changed, 18 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index f063a491b906..d3649c49dd33 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -766,6 +766,7 @@ static inline bool inetdev_valid_mtu(unsigned int mtu) } void ip_sock_set_freebind(struct sock *sk); +int ip_sock_set_mtu_discover(struct sock *sk, int val); void ip_sock_set_recverr(struct sock *sk); void ip_sock_set_tos(struct sock *sk, int val); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index aca6b81da9ba..aa115be11dcf 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -597,6 +597,17 @@ void ip_sock_set_recverr(struct sock *sk) } EXPORT_SYMBOL(ip_sock_set_recverr); +int ip_sock_set_mtu_discover(struct sock *sk, int val) +{ + if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT) + return -EINVAL; + lock_sock(sk); + inet_sk(sk)->pmtudisc = val; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(ip_sock_set_mtu_discover); + /* * Socket option code for IP. This is the end of the line after any * TCP,UDP etc options on an IP socket. 
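The rxrpc conversion below toggles the option around a fragmentable send. Condensed into a sketch, with the socket, message and iovec assumed to be prepared by the caller:

#include <linux/in.h>
#include <linux/net.h>
#include <net/ip.h>

/* Hypothetical send path: clear the DF bit for one datagram, then restore it.
 * Both constants are within the IP_PMTUDISC_DONT..IP_PMTUDISC_OMIT range the
 * helper accepts, so the return values are ignored here.
 */
static int example_send_fragmentable(struct socket *sock, struct msghdr *msg,
				     struct kvec *iov, size_t nr, size_t len)
{
	int ret;

	ip_sock_set_mtu_discover(sock->sk, IP_PMTUDISC_DONT);
	ret = kernel_sendmsg(sock, msg, iov, nr, len);
	ip_sock_set_mtu_discover(sock->sk, IP_PMTUDISC_DO);
	return ret;
}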
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 4c0e8fe5ec1f..6f4e6b4817cf 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -174,13 +174,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) ip_sock_set_recverr(local->socket->sk); /* we want to set the don't fragment bit */ - opt = IP_PMTUDISC_DO; - ret = kernel_setsockopt(local->socket, SOL_IP, IP_MTU_DISCOVER, - (char *) &opt, sizeof(opt)); - if (ret < 0) { - _debug("setsockopt failed"); - goto error; - } + ip_sock_set_mtu_discover(local->socket->sk, IP_PMTUDISC_DO); /* We want receive timestamps. */ sock_enable_timestamps(local->socket->sk); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index f8b632a5c619..1ba43c3df4ad 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -321,7 +321,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, struct kvec iov[2]; rxrpc_serial_t serial; size_t len; - int ret, opt; + int ret; _enter(",{%d}", skb->len); @@ -473,18 +473,14 @@ send_fragmentable: switch (conn->params.local->srx.transport.family) { case AF_INET6: case AF_INET: - opt = IP_PMTUDISC_DONT; - kernel_setsockopt(conn->params.local->socket, - SOL_IP, IP_MTU_DISCOVER, - (char *)&opt, sizeof(opt)); + ip_sock_set_mtu_discover(conn->params.local->socket->sk, + IP_PMTUDISC_DONT); ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); conn->params.peer->last_tx_at = ktime_get_seconds(); - opt = IP_PMTUDISC_DO; - kernel_setsockopt(conn->params.local->socket, - SOL_IP, IP_MTU_DISCOVER, - (char *)&opt, sizeof(opt)); + ip_sock_set_mtu_discover(conn->params.local->socket->sk, + IP_PMTUDISC_DO); break; default: -- cgit v1.2.3 From c1f9ec5776dd05eaf62cf6788ecdfc905dc8ec2b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:30 +0200 Subject: ipv4: add ip_sock_set_pktinfo Add a helper to directly set the IP_PKTINFO sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/net/ip.h | 1 + net/ipv4/ip_sockglue.c | 8 ++++++++ net/sunrpc/svcsock.c | 5 ++--- 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index d3649c49dd33..04ebe7bf54c6 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -767,6 +767,7 @@ static inline bool inetdev_valid_mtu(unsigned int mtu) void ip_sock_set_freebind(struct sock *sk); int ip_sock_set_mtu_discover(struct sock *sk, int val); +void ip_sock_set_pktinfo(struct sock *sk); void ip_sock_set_recverr(struct sock *sk); void ip_sock_set_tos(struct sock *sk, int val); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index aa115be11dcf..84ec3703c909 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -608,6 +608,14 @@ int ip_sock_set_mtu_discover(struct sock *sk, int val) } EXPORT_SYMBOL(ip_sock_set_mtu_discover); +void ip_sock_set_pktinfo(struct sock *sk) +{ + lock_sock(sk); + inet_sk(sk)->cmsg_flags |= IP_CMSG_PKTINFO; + release_sock(sk); +} +EXPORT_SYMBOL(ip_sock_set_pktinfo); + /* * Socket option code for IP. This is the end of the line after any * TCP,UDP etc options on an IP socket. 
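(Aside on what IP_CMSG_PKTINFO buys the caller, before the sunrpc hunk below: once ip_sock_set_pktinfo() has been called, a recvmsg on the socket carries an IP_PKTINFO control message holding the datagram's destination address. The following is an illustrative sketch only; the function name is invented, and msg_control buffer setup and length checking are elided.)

#include <linux/in.h>
#include <linux/printk.h>
#include <linux/socket.h>

/* Sketch: read the destination address out of a received msghdr whose
 * msg_control buffer the IP layer filled in.
 */
static void example_read_pktinfo(struct msghdr *msg)
{
	struct cmsghdr *cmh = msg->msg_control;

	if (cmh->cmsg_level == SOL_IP && cmh->cmsg_type == IP_PKTINFO) {
		struct in_pktinfo *pki = CMSG_DATA(cmh);

		pr_debug("datagram sent to %pI4\n", &pki->ipi_addr);
	}
}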
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 6773dacc64d8..7a805d165689 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -616,9 +616,8 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) /* make sure we get destination address info */ switch (svsk->sk_sk->sk_family) { case AF_INET: - level = SOL_IP; - optname = IP_PKTINFO; - break; + ip_sock_set_pktinfo(svsk->sk_sock->sk); + return; case AF_INET6: level = SOL_IPV6; optname = IPV6_RECVPKTINFO; -- cgit v1.2.3 From 9b115749acb24d11083ded4fe947ddd654a940e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:31 +0200 Subject: ipv6: add ip6_sock_set_v6only Add a helper to directly set the IPV6_V6ONLY sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/net/ipv6.h | 11 +++++++++++ net/ipv6/ip6_udp_tunnel.c | 5 +---- net/sunrpc/svcsock.c | 6 +----- 3 files changed, 13 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 39a00d3ef5e2..9b91188c9a74 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1177,4 +1177,15 @@ int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex, const struct in6_addr *addr, unsigned int mode); int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); + +static inline int ip6_sock_set_v6only(struct sock *sk) +{ + if (inet_sk(sk)->inet_num) + return -EINVAL; + lock_sock(sk); + sk->sk_ipv6only = true; + release_sock(sk); + return 0; +} + #endif /* _NET_IPV6_H */ diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 6523609516d2..2e0ad1bc84a8 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -25,10 +25,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, goto error; if (cfg->ipv6_v6only) { - int val = 1; - - err = kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, - (char *) &val, sizeof(val)); + err = ip6_sock_set_v6only(sock->sk); if (err < 0) goto error; } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 7a805d165689..a391892977cd 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1328,7 +1328,6 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, struct sockaddr *newsin = (struct sockaddr *)&addr; int newlen; int family; - int val; RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); dprintk("svc: svc_create_socket(%s, %d, %s)\n", @@ -1364,11 +1363,8 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, * getting requests from IPv4 remotes. Those should * be shunted to a PF_INET listener via rpcbind. */ - val = 1; if (family == PF_INET6) - kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY, - (char *)&val, sizeof(val)); - + ip6_sock_set_v6only(sock->sk); if (type == SOCK_STREAM) sock->sk->sk_reuse = SK_CAN_REUSE; /* allow address reuse */ error = kernel_bind(sock, sin, len); -- cgit v1.2.3 From fce934949c0f0003c1777fbf8c0706ba82a8cf7e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:32 +0200 Subject: ipv6: add ip6_sock_set_recverr Add a helper to directly set the IPV6_RECVERR sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Reviewed-by: David Howells Signed-off-by: David S. 
Miller --- include/net/ipv6.h | 7 +++++++ net/rxrpc/local_object.c | 10 ++-------- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 9b91188c9a74..49c4abf99148 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1188,4 +1188,11 @@ static inline int ip6_sock_set_v6only(struct sock *sk) return 0; } +static inline void ip6_sock_set_recverr(struct sock *sk) +{ + lock_sock(sk); + inet6_sk(sk)->recverr = true; + release_sock(sk); +} + #endif /* _NET_IPV6_H */ diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 6f4e6b4817cf..c8b2097f499c 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -107,7 +107,7 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet, static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) { struct sock *usk; - int ret, opt; + int ret; _enter("%p{%d,%d}", local, local->srx.transport_type, local->srx.transport.family); @@ -157,13 +157,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) switch (local->srx.transport.family) { case AF_INET6: /* we want to receive ICMPv6 errors */ - opt = 1; - ret = kernel_setsockopt(local->socket, SOL_IPV6, IPV6_RECVERR, - (char *) &opt, sizeof(opt)); - if (ret < 0) { - _debug("setsockopt failed"); - goto error; - } + ip6_sock_set_recverr(local->socket->sk); /* Fall through and set IPv4 options too otherwise we don't get * errors from IPv4 packets sent through the IPv6 socket. -- cgit v1.2.3 From 18d5ad62327576cbb1e5b9938a59d63ac0c15832 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:33 +0200 Subject: ipv6: add ip6_sock_set_addr_preferences Add a helper to directly set the IPV6_ADDR_PREFERENCES sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S.
Miller --- include/net/ipv6.h | 67 ++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv6/ipv6_sockglue.c | 59 +----------------------------------------- net/sunrpc/xprtsock.c | 7 ++--- 3 files changed, 72 insertions(+), 61 deletions(-) (limited to 'net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 49c4abf99148..9a9075983016 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1195,4 +1195,71 @@ static inline void ip6_sock_set_recverr(struct sock *sk) release_sock(sk); } +static inline int __ip6_sock_set_addr_preferences(struct sock *sk, int val) +{ + unsigned int pref = 0; + unsigned int prefmask = ~0; + + /* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */ + switch (val & (IPV6_PREFER_SRC_PUBLIC | + IPV6_PREFER_SRC_TMP | + IPV6_PREFER_SRC_PUBTMP_DEFAULT)) { + case IPV6_PREFER_SRC_PUBLIC: + pref |= IPV6_PREFER_SRC_PUBLIC; + prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | + IPV6_PREFER_SRC_TMP); + break; + case IPV6_PREFER_SRC_TMP: + pref |= IPV6_PREFER_SRC_TMP; + prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | + IPV6_PREFER_SRC_TMP); + break; + case IPV6_PREFER_SRC_PUBTMP_DEFAULT: + prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | + IPV6_PREFER_SRC_TMP); + break; + case 0: + break; + default: + return -EINVAL; + } + + /* check HOME/COA conflicts */ + switch (val & (IPV6_PREFER_SRC_HOME | IPV6_PREFER_SRC_COA)) { + case IPV6_PREFER_SRC_HOME: + prefmask &= ~IPV6_PREFER_SRC_COA; + break; + case IPV6_PREFER_SRC_COA: + pref |= IPV6_PREFER_SRC_COA; + break; + case 0: + break; + default: + return -EINVAL; + } + + /* check CGA/NONCGA conflicts */ + switch (val & (IPV6_PREFER_SRC_CGA|IPV6_PREFER_SRC_NONCGA)) { + case IPV6_PREFER_SRC_CGA: + case IPV6_PREFER_SRC_NONCGA: + case 0: + break; + default: + return -EINVAL; + } + + inet6_sk(sk)->srcprefs = (inet6_sk(sk)->srcprefs & prefmask) | pref; + return 0; +} + +static inline int ip6_sock_set_addr_preferences(struct sock *sk, int val) +{ + int ret; + + lock_sock(sk); + ret = __ip6_sock_set_addr_preferences(sk, val); + release_sock(sk); + return ret; +} + #endif /* _NET_IPV6_H */ diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index e10258c2210e..adbfed6adf11 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -845,67 +845,10 @@ done: break; case IPV6_ADDR_PREFERENCES: - { - unsigned int pref = 0; - unsigned int prefmask = ~0; - if (optlen < sizeof(int)) goto e_inval; - - retv = -EINVAL; - - /* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */ - switch (val & (IPV6_PREFER_SRC_PUBLIC| - IPV6_PREFER_SRC_TMP| - IPV6_PREFER_SRC_PUBTMP_DEFAULT)) { - case IPV6_PREFER_SRC_PUBLIC: - pref |= IPV6_PREFER_SRC_PUBLIC; - break; - case IPV6_PREFER_SRC_TMP: - pref |= IPV6_PREFER_SRC_TMP; - break; - case IPV6_PREFER_SRC_PUBTMP_DEFAULT: - break; - case 0: - goto pref_skip_pubtmp; - default: - goto e_inval; - } - - prefmask &= ~(IPV6_PREFER_SRC_PUBLIC| - IPV6_PREFER_SRC_TMP); -pref_skip_pubtmp: - - /* check HOME/COA conflicts */ - switch (val & (IPV6_PREFER_SRC_HOME|IPV6_PREFER_SRC_COA)) { - case IPV6_PREFER_SRC_HOME: - break; - case IPV6_PREFER_SRC_COA: - pref |= IPV6_PREFER_SRC_COA; - case 0: - goto pref_skip_coa; - default: - goto e_inval; - } - - prefmask &= ~IPV6_PREFER_SRC_COA; -pref_skip_coa: - - /* check CGA/NONCGA conflicts */ - switch (val & (IPV6_PREFER_SRC_CGA|IPV6_PREFER_SRC_NONCGA)) { - case IPV6_PREFER_SRC_CGA: - case IPV6_PREFER_SRC_NONCGA: - case 0: - break; - default: - goto e_inval; - } - - np->srcprefs = (np->srcprefs & prefmask) | pref; - retv = 0; - + retv = __ip6_sock_set_addr_preferences(sk, val); break; - } case
IPV6_MINHOPCOUNT: if (optlen < sizeof(int)) goto e_inval; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 0d3ec055bc12..3a143e250b9a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2150,7 +2150,6 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) if (!transport->inet) { struct sock *sk = sock->sk; - unsigned int addr_pref = IPV6_PREFER_SRC_PUBLIC; /* Avoid temporary address, they are bad for long-lived * connections such as NFS mounts. @@ -2159,8 +2158,10 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) * knowledge about the normal duration of connections, * MAY override this as appropriate. */ - kernel_setsockopt(sock, SOL_IPV6, IPV6_ADDR_PREFERENCES, - (char *)&addr_pref, sizeof(addr_pref)); + if (xs_addr(xprt)->sa_family == PF_INET6) { + ip6_sock_set_addr_preferences(sk, + IPV6_PREFER_SRC_PUBLIC); + } xs_tcp_set_socket_timeouts(xprt, sock); -- cgit v1.2.3 From 7d7207c2d57080af93fc323dc6a85bd79207b4c6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:34 +0200 Subject: ipv6: add ip6_sock_set_recvpktinfo Add a helper to directly set the IPV6_RECVPKTINFO sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/net/ipv6.h | 7 +++++++ net/sunrpc/svcsock.c | 10 ++-------- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 9a9075983016..5e65bf2fd32d 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1262,4 +1262,11 @@ static inline int ip6_sock_set_addr_preferences(struct sock *sk, bool val) return ret; } +static inline void ip6_sock_set_recvpktinfo(struct sock *sk) +{ + lock_sock(sk); + inet6_sk(sk)->rxopt.bits.rxinfo = true; + release_sock(sk); +} + #endif /* _NET_IPV6_H */ diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index a391892977cd..e7a0037d9b56 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -595,8 +595,6 @@ static struct svc_xprt_class svc_udp_class = { static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) { - int err, level, optname, one = 1; - svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_udp_class, &svsk->sk_xprt, serv); clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); @@ -617,17 +615,13 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) switch (svsk->sk_sk->sk_family) { case AF_INET: ip_sock_set_pktinfo(svsk->sk_sock->sk); - return; + break; case AF_INET6: - level = SOL_IPV6; - optname = IPV6_RECVPKTINFO; + ip6_sock_set_recvpktinfo(svsk->sk_sock->sk); break; default: BUG(); } - err = kernel_setsockopt(svsk->sk_sock, level, optname, - (char *)&one, sizeof(one)); - dprintk("svc: kernel_setsockopt returned %d\n", err); } /* -- cgit v1.2.3 From 298cd88a66a02c899772ffafbf648786ceb5ab95 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:35 +0200 Subject: rxrpc: add rxrpc_sock_set_min_security_level Add a helper to directly set the RXRPC_MIN_SECURITY_LEVEL sockopt from kernel space without going through a fake uaccess. Thanks to David Howells for the documentation updates. Signed-off-by: Christoph Hellwig Acked-by: David Howells Signed-off-by: David S. 
Miller --- Documentation/networking/rxrpc.rst | 13 +++++++++++-- fs/afs/rxrpc.c | 6 ++---- include/net/af_rxrpc.h | 2 ++ net/rxrpc/af_rxrpc.c | 13 +++++++++++++ 4 files changed, 28 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/Documentation/networking/rxrpc.rst b/Documentation/networking/rxrpc.rst index 5ad35113d0f4..68552b92dc44 100644 --- a/Documentation/networking/rxrpc.rst +++ b/Documentation/networking/rxrpc.rst @@ -477,7 +477,7 @@ AF_RXRPC sockets support a few socket options at the SOL_RXRPC level: Encrypted checksum plus packet padded and first eight bytes of packet encrypted - which includes the actual packet length. - (c) RXRPC_SECURITY_ENCRYPTED + (c) RXRPC_SECURITY_ENCRYPT Encrypted checksum plus entire packet padded and encrypted, including actual packet length. @@ -578,7 +578,7 @@ A client would issue an operation by: This issues a request_key() to get the key representing the security context. The minimum security level can be set:: - unsigned int sec = RXRPC_SECURITY_ENCRYPTED; + unsigned int sec = RXRPC_SECURITY_ENCRYPT; setsockopt(client, SOL_RXRPC, RXRPC_MIN_SECURITY_LEVEL, &sec, sizeof(sec)); @@ -1090,6 +1090,15 @@ The kernel interface functions are as follows: jiffies). In the event of the timeout occurring, the call will be aborted and -ETIME or -ETIMEDOUT will be returned. + (#) Apply the RXRPC_MIN_SECURITY_LEVEL sockopt to a socket from within the + kernel:: + + int rxrpc_sock_set_min_security_level(struct sock *sk, + unsigned int val); + + This specifies the minimum security level required for calls on this + socket. + Configurable Parameters ======================= diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 1ecc67da6c1a..e313dae01674 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -37,7 +37,6 @@ int afs_open_socket(struct afs_net *net) { struct sockaddr_rxrpc srx; struct socket *socket; - unsigned int min_level; int ret; _enter(""); @@ -57,9 +56,8 @@ int afs_open_socket(struct afs_net *net) srx.transport.sin6.sin6_family = AF_INET6; srx.transport.sin6.sin6_port = htons(AFS_CM_PORT); - min_level = RXRPC_SECURITY_ENCRYPT; - ret = kernel_setsockopt(socket, SOL_RXRPC, RXRPC_MIN_SECURITY_LEVEL, - (void *)&min_level, sizeof(min_level)); + ret = rxrpc_sock_set_min_security_level(socket->sk, + RXRPC_SECURITY_ENCRYPT); if (ret < 0) goto error_2; diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index ab988940bf04..91eacbdcf33d 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -72,4 +72,6 @@ bool rxrpc_kernel_call_is_complete(struct rxrpc_call *); void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *, unsigned long); +int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val); + #endif /* _NET_RXRPC_H */ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 15ee92d79581..394189b81849 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -571,6 +571,19 @@ out: return ret; } +int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val) +{ + if (sk->sk_state != RXRPC_UNBOUND) + return -EISCONN; + if (val > RXRPC_SECURITY_MAX) + return -EINVAL; + lock_sock(sk); + rxrpc_sk(sk)->min_sec_level = val; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(rxrpc_sock_set_min_security_level); + /* * set RxRPC socket options */ -- cgit v1.2.3 From 095ae612530c9465df6d372d688cb30c6abfc5f5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:36 +0200 Subject: tipc: call tsk_set_importance from tipc_topsrv_create_listener Avoid using kernel_setsockopt
for the TIPC_IMPORTANCE option when we can just use the internal helper. The only change needed is to pass a struct sock instead of tipc_sock, which is private to socket.c. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- net/tipc/socket.c | 18 +++++++++--------- net/tipc/socket.h | 2 ++ net/tipc/topsrv.c | 6 +++--- 3 files changed, 14 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index d6b67d07d22e..3734cdbedc9c 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -196,17 +196,17 @@ static int tsk_importance(struct tipc_sock *tsk) return msg_importance(&tsk->phdr); } -static int tsk_set_importance(struct tipc_sock *tsk, int imp) +static struct tipc_sock *tipc_sk(const struct sock *sk) { - if (imp > TIPC_CRITICAL_IMPORTANCE) - return -EINVAL; - msg_set_importance(&tsk->phdr, (u32)imp); - return 0; + return container_of(sk, struct tipc_sock, sk); } -static struct tipc_sock *tipc_sk(const struct sock *sk) +int tsk_set_importance(struct sock *sk, int imp) { - return container_of(sk, struct tipc_sock, sk); + if (imp > TIPC_CRITICAL_IMPORTANCE) + return -EINVAL; + msg_set_importance(&tipc_sk(sk)->phdr, (u32)imp); + return 0; } static bool tsk_conn_cong(struct tipc_sock *tsk) @@ -2721,7 +2721,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, /* Connect new socket to it's peer */ tipc_sk_finish_conn(new_tsock, msg_origport(msg), msg_orignode(msg)); - tsk_set_importance(new_tsock, msg_importance(msg)); + tsk_set_importance(new_sk, msg_importance(msg)); if (msg_named(msg)) { new_tsock->conn_type = msg_nametype(msg); new_tsock->conn_instance = msg_nameinst(msg); @@ -3139,7 +3139,7 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, switch (opt) { case TIPC_IMPORTANCE: - res = tsk_set_importance(tsk, value); + res = tsk_set_importance(sk, value); break; case TIPC_SRC_DROPPABLE: if (sock->type != SOCK_STREAM) diff --git a/net/tipc/socket.h b/net/tipc/socket.h index 235b9679acee..b11575afc66f 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -75,4 +75,6 @@ u32 tipc_sock_get_portid(struct sock *sk); bool tipc_sk_overlimit1(struct sock *sk, struct sk_buff *skb); bool tipc_sk_overlimit2(struct sock *sk, struct sk_buff *skb); +int tsk_set_importance(struct sock *sk, int imp); + #endif diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index 446af7bbd13e..1489cfb941d8 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -497,7 +497,6 @@ static void tipc_topsrv_listener_data_ready(struct sock *sk) static int tipc_topsrv_create_listener(struct tipc_topsrv *srv) { - int imp = TIPC_CRITICAL_IMPORTANCE; struct socket *lsock = NULL; struct sockaddr_tipc saddr; struct sock *sk; @@ -514,8 +513,9 @@ static int tipc_topsrv_create_listener(struct tipc_topsrv *srv) sk->sk_user_data = srv; write_unlock_bh(&sk->sk_callback_lock); - rc = kernel_setsockopt(lsock, SOL_TIPC, TIPC_IMPORTANCE, - (char *)&imp, sizeof(imp)); + lock_sock(sk); + rc = tsk_set_importance(sk, TIPC_CRITICAL_IMPORTANCE); + release_sock(sk); if (rc < 0) goto err; -- cgit v1.2.3 From e6da0edc24eecef2f6964d92fa9044e1821deace Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 28 May 2020 14:35:12 +0200 Subject: Bluetooth: Acquire sk_lock.slock without disabling interrupts There was a lockdep warning which led to commit fad003b6c8e3d ("Bluetooth: Fix inconsistent lock state with RFCOMM"). Lockdep noticed that `sk->sk_lock.slock' was acquired without disabling the softirq while the lock was also used in
softirq context. Unfortunately the solution back then was to disable interrupts before acquiring the lock, which was enough to make lockdep happy. It would have been enough to simply disable the softirq. Disabling interrupts before acquiring a spinlock_t is not allowed on PREEMPT_RT because these locks are converted to 'sleeping' spinlocks. Use spin_lock_bh() in order to acquire the `sk_lock.slock'. Reported-by: Luis Claudio R. Goncalves Reported-by: kbuild test robot [missing unlock] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Marcel Holtmann --- net/bluetooth/rfcomm/sock.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index b4eaf21360ef..df14eebe80da 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -64,15 +64,13 @@ static void rfcomm_sk_data_ready(struct rfcomm_dlc *d, struct sk_buff *skb) static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err) { struct sock *sk = d->owner, *parent; - unsigned long flags; if (!sk) return; BT_DBG("dlc %p state %ld err %d", d, d->state, err); - local_irq_save(flags); - bh_lock_sock(sk); + spin_lock_bh(&sk->sk_lock.slock); if (err) sk->sk_err = err; @@ -93,8 +91,7 @@ static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err) sk->sk_state_change(sk); } - bh_unlock_sock(sk); - local_irq_restore(flags); + spin_unlock_bh(&sk->sk_lock.slock); if (parent && sock_flag(sk, SOCK_ZAPPED)) { /* We have to drop DLC lock here, otherwise -- cgit v1.2.3 From 05bfd3661448a46db3a258b316160d34cf0a1317 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 May 2020 14:09:41 +0200 Subject: sctp: refactor sctp_setsockopt_bindx Split out a sctp_setsockopt_bindx_kernel that takes a kernel pointer to the sockaddr and make sctp_setsockopt_bindx a small wrapper around it. This prepares for adding a new bind_add proto op. Signed-off-by: Christoph Hellwig Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 61 +++++++++++++++++++++++++------------------------------ 1 file changed, 28 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 827a9903ee28..6e745ac3c4a5 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -972,23 +972,22 @@ int sctp_asconf_mgmt(struct sctp_sock *sp, struct sctp_sockaddr_entry *addrw) * it. * * sk The sk of the socket - * addrs The pointer to the addresses in user land + * addrs The pointer to the addresses * addrssize Size of the addrs buffer * op Operation to perform (add or remove, see the flags of * sctp_bindx) * * Returns 0 if ok, <0 errno code on error. */ -static int sctp_setsockopt_bindx(struct sock *sk, - struct sockaddr __user *addrs, - int addrs_size, int op) +static int sctp_setsockopt_bindx_kernel(struct sock *sk, + struct sockaddr *addrs, int addrs_size, + int op) { - struct sockaddr *kaddrs; int err; int addrcnt = 0; int walk_size = 0; struct sockaddr *sa_addr; - void *addr_buf; + void *addr_buf = addrs; struct sctp_af *af; pr_debug("%s: sk:%p addrs:%p addrs_size:%d opt:%d\n", @@ -997,17 +996,10 @@ static int sctp_setsockopt_bindx(struct sock *sk, if (unlikely(addrs_size <= 0)) return -EINVAL; - kaddrs = memdup_user(addrs, addrs_size); - if (IS_ERR(kaddrs)) - return PTR_ERR(kaddrs); - /* Walk through the addrs buffer and count the number of addresses.
*/ addr_buf = kaddrs; while (walk_size < addrs_size) { - if (walk_size + sizeof(sa_family_t) > addrs_size) { - kfree(kaddrs); + if (walk_size + sizeof(sa_family_t) > addrs_size) return -EINVAL; - } sa_addr = addr_buf; af = sctp_get_af_specific(sa_addr->sa_family); @@ -1015,10 +1007,8 @@ static int sctp_setsockopt_bindx(struct sock *sk, /* If the address family is not supported or if this address * causes the address buffer to overflow return EINVAL. */ - if (!af || (walk_size + af->sockaddr_len) > addrs_size) { - kfree(kaddrs); + if (!af || (walk_size + af->sockaddr_len) > addrs_size) return -EINVAL; - } addrcnt++; addr_buf += af->sockaddr_len; walk_size += af->sockaddr_len; @@ -1029,31 +1019,36 @@ static int sctp_setsockopt_bindx(struct sock *sk, case SCTP_BINDX_ADD_ADDR: /* Allow security module to validate bindx addresses. */ err = security_sctp_bind_connect(sk, SCTP_SOCKOPT_BINDX_ADD, - (struct sockaddr *)kaddrs, - addrs_size); + addrs, addrs_size); if (err) - goto out; - err = sctp_bindx_add(sk, kaddrs, addrcnt); + return err; + err = sctp_bindx_add(sk, addrs, addrcnt); if (err) - goto out; - err = sctp_send_asconf_add_ip(sk, kaddrs, addrcnt); - break; - + return err; + return sctp_send_asconf_add_ip(sk, addrs, addrcnt); case SCTP_BINDX_REM_ADDR: - err = sctp_bindx_rem(sk, kaddrs, addrcnt); + err = sctp_bindx_rem(sk, addrs, addrcnt); if (err) - goto out; - err = sctp_send_asconf_del_ip(sk, kaddrs, addrcnt); - break; + return err; + return sctp_send_asconf_del_ip(sk, addrs, addrcnt); default: - err = -EINVAL; - break; + return -EINVAL; } +} -out: - kfree(kaddrs); +static int sctp_setsockopt_bindx(struct sock *sk, + struct sockaddr __user *addrs, + int addrs_size, int op) +{ + struct sockaddr *kaddrs; + int err; + kaddrs = memdup_user(addrs, addrs_size); + if (IS_ERR(kaddrs)) + return PTR_ERR(kaddrs); + err = sctp_setsockopt_bindx_kernel(sk, kaddrs, addrs_size, op); + kfree(kaddrs); return err; } -- cgit v1.2.3 From c0425a4249e9d313eec5f81c0bde8a286ebf9a63 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 May 2020 14:09:42 +0200 Subject: net: add a new bind_add method The SCTP protocol allows binding multiple addresses to a socket. That feature is currently only exposed as a socket option. Add a bind_add method to struct proto that allows binding additional addresses, and switch the dlm code to use the method instead of going through the socket option from kernel space. Signed-off-by: Christoph Hellwig Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S.
Miller --- fs/dlm/lowcomms.c | 9 +++------ include/net/sock.h | 6 +++++- net/core/sock.c | 8 ++++++++ net/sctp/socket.c | 14 ++++++++++++++ 4 files changed, 30 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 9f1c3cdc9d65..3543a8fec907 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -882,6 +882,7 @@ static void writequeue_entry_complete(struct writequeue_entry *e, int completed) static int sctp_bind_addrs(struct connection *con, uint16_t port) { struct sockaddr_storage localaddr; + struct sockaddr *addr = (struct sockaddr *)&localaddr; int i, addr_len, result = 0; for (i = 0; i < dlm_local_count; i++) { @@ -889,13 +890,9 @@ static int sctp_bind_addrs(struct connection *con, uint16_t port) make_sockaddr(&localaddr, port, &addr_len); if (!i) - result = kernel_bind(con->sock, - (struct sockaddr *)&localaddr, - addr_len); + result = kernel_bind(con->sock, addr, addr_len); else - result = kernel_setsockopt(con->sock, SOL_SCTP, - SCTP_SOCKOPT_BINDX_ADD, - (char *)&localaddr, addr_len); + result = sock_bind_add(con->sock->sk, addr, addr_len); if (result < 0) { log_print("Can't bind to %d addr number %d, %d.\n", diff --git a/include/net/sock.h b/include/net/sock.h index d994daa418ec..6e9f713a7860 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1156,7 +1156,9 @@ struct proto { int (*sendpage)(struct sock *sk, struct page *page, int offset, size_t size, int flags); int (*bind)(struct sock *sk, - struct sockaddr *uaddr, int addr_len); + struct sockaddr *addr, int addr_len); + int (*bind_add)(struct sock *sk, + struct sockaddr *addr, int addr_len); int (*backlog_rcv) (struct sock *sk, struct sk_buff *skb); @@ -2698,4 +2700,6 @@ void sock_set_reuseaddr(struct sock *sk); void sock_set_reuseport(struct sock *sk); void sock_set_sndtimeo(struct sock *sk, s64 secs); +int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len); + #endif /* _SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index 2ca3425b519c..61ec573221a6 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3712,3 +3712,11 @@ bool sk_busy_loop_end(void *p, unsigned long start_time) } EXPORT_SYMBOL(sk_busy_loop_end); #endif /* CONFIG_NET_RX_BUSY_POLL */ + +int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) +{ + if (!sk->sk_prot->bind_add) + return -EOPNOTSUPP; + return sk->sk_prot->bind_add(sk, addr, addr_len); +} +EXPORT_SYMBOL(sock_bind_add); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 6e745ac3c4a5..d57e1a002ffc 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1052,6 +1052,18 @@ static int sctp_setsockopt_bindx(struct sock *sk, return err; } +static int sctp_bind_add(struct sock *sk, struct sockaddr *addrs, + int addrlen) +{ + int err; + + lock_sock(sk); + err = sctp_setsockopt_bindx_kernel(sk, addrs, addrlen, + SCTP_BINDX_ADD_ADDR); + release_sock(sk); + return err; +} + static int sctp_connect_new_asoc(struct sctp_endpoint *ep, const union sctp_addr *daddr, const struct sctp_initmsg *init, @@ -9620,6 +9632,7 @@ struct proto sctp_prot = { .sendmsg = sctp_sendmsg, .recvmsg = sctp_recvmsg, .bind = sctp_bind, + .bind_add = sctp_bind_add, .backlog_rcv = sctp_backlog_rcv, .hash = sctp_hash, .unhash = sctp_unhash, @@ -9662,6 +9675,7 @@ struct proto sctpv6_prot = { .sendmsg = sctp_sendmsg, .recvmsg = sctp_recvmsg, .bind = sctp_bind, + .bind_add = sctp_bind_add, .backlog_rcv = sctp_backlog_rcv, .hash = sctp_hash, .unhash = sctp_unhash, -- cgit v1.2.3 From 5a892ff2facb4548c17c05931ed899038a0da63e Mon Sep 
17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 May 2020 14:09:43 +0200 Subject: net: remove kernel_setsockopt No users left. Signed-off-by: Christoph Hellwig Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/linux/net.h | 2 -- net/socket.c | 31 ------------------------------- 2 files changed, 33 deletions(-) (limited to 'net') diff --git a/include/linux/net.h b/include/linux/net.h index 74ef5d7315f7..e10f378194a5 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -303,8 +303,6 @@ int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, int flags); int kernel_getsockname(struct socket *sock, struct sockaddr *addr); int kernel_getpeername(struct socket *sock, struct sockaddr *addr); -int kernel_setsockopt(struct socket *sock, int level, int optname, char *optval, - unsigned int optlen); int kernel_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); int kernel_sendpage_locked(struct sock *sk, struct page *page, int offset, diff --git a/net/socket.c b/net/socket.c index 81a98b6cbd08..976426d03f09 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3624,37 +3624,6 @@ int kernel_getpeername(struct socket *sock, struct sockaddr *addr) } EXPORT_SYMBOL(kernel_getpeername); -/** - * kernel_setsockopt - set a socket option (kernel space) - * @sock: socket - * @level: API level (SOL_SOCKET, ...) - * @optname: option tag - * @optval: option value - * @optlen: option length - * - * Returns 0 or an error. - */ - -int kernel_setsockopt(struct socket *sock, int level, int optname, - char *optval, unsigned int optlen) -{ - mm_segment_t oldfs = get_fs(); - char __user *uoptval; - int err; - - uoptval = (char __user __force *) optval; - - set_fs(KERNEL_DS); - if (level == SOL_SOCKET) - err = sock_setsockopt(sock, level, optname, uoptval, optlen); - else - err = sock->ops->setsockopt(sock, level, optname, uoptval, - optlen); - set_fs(oldfs); - return err; -} -EXPORT_SYMBOL(kernel_setsockopt); - /** * kernel_sendpage - send a &page through a socket (kernel space) * @sock: socket -- cgit v1.2.3 From 04198499b23f9d73a127a17b8576c9266c8f6f9b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 27 May 2020 19:41:34 +0300 Subject: net: dsa: tag_8021q: stop restoring VLANs from bridge Right now, our only tag_8021q user, sja1105, has the ability to restore bridge VLANs on its own, so this logic is unnecessary. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dsa/tag_8021q.c | 61 +---------------------------------------------------- 1 file changed, 1 insertion(+), 60 deletions(-) (limited to 'net') diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 3052da668156..780b2a15ac9b 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -140,34 +140,6 @@ bool vid_is_dsa_8021q(u16 vid) } EXPORT_SYMBOL_GPL(vid_is_dsa_8021q); -static int dsa_8021q_restore_pvid(struct dsa_switch *ds, int port) -{ - struct bridge_vlan_info vinfo; - struct net_device *slave; - u16 pvid; - int err; - - if (!dsa_is_user_port(ds, port)) - return 0; - - slave = dsa_to_port(ds, port)->slave; - - err = br_vlan_get_pvid(slave, &pvid); - if (!pvid || err < 0) - /* There is no pvid on the bridge for this port, which is - * perfectly valid. Nothing to restore, bye-bye! 
- */ - return 0; - - err = br_vlan_get_info(slave, pvid, &vinfo); - if (err < 0) { - dev_err(ds->dev, "Couldn't determine PVID attributes\n"); - return err; - } - - return dsa_port_vid_add(dsa_to_port(ds, port), pvid, vinfo.flags); -} - /* If @enabled is true, installs @vid with @flags into the switch port's HW * filter. * If @enabled is false, deletes @vid (ignores @flags) from the port. Had the @@ -178,39 +150,11 @@ static int dsa_8021q_vid_apply(struct dsa_switch *ds, int port, u16 vid, u16 flags, bool enabled) { struct dsa_port *dp = dsa_to_port(ds, port); - struct bridge_vlan_info vinfo; - int err; if (enabled) return dsa_port_vid_add(dp, vid, flags); - err = dsa_port_vid_del(dp, vid); - if (err < 0) - return err; - - /* Nothing to restore from the bridge for a non-user port. - * The CPU port VLANs are restored implicitly with the user ports, - * similar to how the bridge does in dsa_slave_vlan_add and - * dsa_slave_vlan_del. - */ - if (!dsa_is_user_port(ds, port)) - return 0; - - err = br_vlan_get_info(dp->slave, vid, &vinfo); - /* Couldn't determine bridge attributes for this vid, - * it means the bridge had not configured it. - */ - if (err < 0) - return 0; - - /* Restore the VID from the bridge */ - err = dsa_port_vid_add(dp, vid, vinfo.flags); - if (err < 0) - return err; - - vinfo.flags &= ~BRIDGE_VLAN_INFO_PVID; - - return dsa_port_vid_add(dp->cpu_dp, vid, vinfo.flags); + return dsa_port_vid_del(dp, vid); } /* RX VLAN tagging (left) and TX VLAN tagging (right) setup shown for a single @@ -329,9 +273,6 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled) return err; } - if (!enabled) - err = dsa_8021q_restore_pvid(ds, port); - return err; } EXPORT_SYMBOL_GPL(dsa_port_setup_8021q_tagging); -- cgit v1.2.3 From 8298a419a0064c6cd6c84a4a45de294a3eff0832 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 28 May 2020 07:43:59 +0000 Subject: tipc: remove set but not used variable 'prev' Fixes gcc '-Wunused-but-set-variable' warning: net/tipc/msg.c: In function 'tipc_msg_append': net/tipc/msg.c:215:24: warning: variable 'prev' set but not used [-Wunused-but-set-variable] Commit 0a3e060f340d ("tipc: add test for Nagle algorithm effectiveness") left this behind; remove it. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/tipc/msg.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 23809039dda1..c0afcd627c5e 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -212,7 +212,7 @@ err: int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen, int mss, struct sk_buff_head *txq) { - struct sk_buff *skb, *prev; + struct sk_buff *skb; int accounted, total, curr; int mlen, cpy, rem = dlen; struct tipc_msg *hdr; @@ -223,7 +223,6 @@ int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen, while (rem) { if (!skb || skb->len >= mss) { - prev = skb; skb = tipc_buf_acquire(mss, GFP_KERNEL); if (unlikely(!skb)) return -ENOMEM; -- cgit v1.2.3 From fd55199d3b762f9555c66a1bc4bf6eac3901fc2f Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 28 May 2020 23:43:24 +0200 Subject: net: ethtool: cabletest: Make ethnl_act_cable_test_tdr_cfg static kbuild test robot is reporting: net/ethtool/cabletest.c:230:5: warning: no previous prototype for 'ethnl_act_cable_test_tdr_cfg'. Mark the function as static. Reported-by: kbuild test robot Signed-off-by: Andrew Lunn Signed-off-by: David S.
Miller --- net/ethtool/cabletest.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index 9991688d7d1d..7b7a0456c15c 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -227,9 +227,9 @@ cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_MAX + 1] = { }; /* CABLE_TEST_TDR_ACT */ -int ethnl_act_cable_test_tdr_cfg(const struct nlattr *nest, - struct genl_info *info, - struct phy_tdr_config *cfg) +static int ethnl_act_cable_test_tdr_cfg(const struct nlattr *nest, + struct genl_info *info, + struct phy_tdr_config *cfg) { struct nlattr *tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX + 1]; int ret; -- cgit v1.2.3 From bc183dec08f9cb177cf5206a010b7a9e7b22e567 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 May 2020 00:01:52 +0200 Subject: tcp: tcp_init_buffer_space can be static As of commit 98fa6271cfcb ("tcp: refactor setting the initial congestion window") this is called only from tcp_input.c, so it can be static. Signed-off-by: Florian Westphal Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 - net/ipv4/tcp_input.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 66e4b8331850..bca761ffa25f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -662,7 +662,6 @@ void tcp_initialize_rcv_mss(struct sock *sk); int tcp_mtu_to_mss(struct sock *sk, int pmtu); int tcp_mss_to_mtu(struct sock *sk, int mss); void tcp_mtup_init(struct sock *sk); -void tcp_init_buffer_space(struct sock *sk); static inline void tcp_bound_rto(const struct sock *sk) { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ad90102f5dfb..83330a6cb242 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -437,7 +437,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) /* 3. Try to fixup all. It is made immediately after connection enters * established state. */ -void tcp_init_buffer_space(struct sock *sk) +static void tcp_init_buffer_space(struct sock *sk) { int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win; struct tcp_sock *tp = tcp_sk(sk); -- cgit v1.2.3 From 86ae579cefffe18cb08928505d90fbc87367e8f5 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 27 May 2020 08:35:03 +0300 Subject: net: Make mpls_entry_encode() available for generic users Move mpls_entry_encode() from net/mpls/internal.h to include/net/mpls.h and make it available for other users. Specifically, hardware drivers that offload MPLS can benefit from it. Suggested-by: Jakub Kicinski Suggested-by: David Ahern Signed-off-by: Eli Cohen Signed-off-by: Saeed Mahameed --- include/net/mpls.h | 17 +++++++++++++++++ net/mpls/internal.h | 11 ----------- 2 files changed, 17 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/mpls.h b/include/net/mpls.h index ccaf238e8ea7..0bb7944e7b08 100644 --- a/include/net/mpls.h +++ b/include/net/mpls.h @@ -8,6 +8,7 @@ #include #include +#include #define MPLS_HLEN 4 @@ -25,4 +26,20 @@ static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb) { return (struct mpls_shim_hdr *)skb_network_header(skb); } + +static inline struct mpls_shim_hdr mpls_entry_encode(u32 label, + unsigned int ttl, + unsigned int tc, + bool bos) +{ + struct mpls_shim_hdr result; + + result.label_stack_entry = + cpu_to_be32((label << MPLS_LS_LABEL_SHIFT) | + (tc << MPLS_LS_TC_SHIFT) | + (bos ?
(1 << MPLS_LS_S_SHIFT) : 0) | + (ttl << MPLS_LS_TTL_SHIFT)); + return result; +} + #endif diff --git a/net/mpls/internal.h b/net/mpls/internal.h index 0e9aa94adc07..838cdfc10e47 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -172,17 +172,6 @@ struct mpls_route { /* next hop label forwarding entry */ #define endfor_nexthops(rt) } -static inline struct mpls_shim_hdr mpls_entry_encode(u32 label, unsigned ttl, unsigned tc, bool bos) -{ - struct mpls_shim_hdr result; - result.label_stack_entry = - cpu_to_be32((label << MPLS_LS_LABEL_SHIFT) | - (tc << MPLS_LS_TC_SHIFT) | - (bos ? (1 << MPLS_LS_S_SHIFT) : 0) | - (ttl << MPLS_LS_TTL_SHIFT)); - return result; -} - static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr *hdr) { struct mpls_entry_decoded result; -- cgit v1.2.3 From b8ded9de8db34dd209a3dece94cf54fc414e78f7 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Sat, 30 May 2020 16:42:37 +0200 Subject: net/smc: pre-fetch send buffer outside of send_lock Pre-fetch send buffer for the CDC validation message before entering the send_lock. Without that the send call might fail with -EBUSY because there are no free buffers and waiting for buffers is not possible under send_lock. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_cdc.c | 10 +++------- net/smc/smc_cdc.h | 4 +++- net/smc/smc_core.c | 18 +++++++++++++++--- 3 files changed, 21 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index b2b85e1be72c..a47e8855e045 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -116,19 +116,15 @@ int smc_cdc_msg_send(struct smc_connection *conn, } /* send a validation msg indicating the move of a conn to an other QP link */ -int smcr_cdc_msg_send_validation(struct smc_connection *conn) +int smcr_cdc_msg_send_validation(struct smc_connection *conn, + struct smc_cdc_tx_pend *pend, + struct smc_wr_buf *wr_buf) { struct smc_host_cdc_msg *local = &conn->local_tx_ctrl; struct smc_link *link = conn->lnk; - struct smc_cdc_tx_pend *pend; - struct smc_wr_buf *wr_buf; struct smc_cdc_msg *peer; int rc; - rc = smc_cdc_get_free_slot(conn, link, &wr_buf, NULL, &pend); - if (rc) - return rc; - peer = (struct smc_cdc_msg *)wr_buf; peer->common.type = local->common.type; peer->len = local->len; diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 2ddcc5fb5ceb..0a0a89abd38b 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -296,7 +296,9 @@ int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, struct smc_cdc_tx_pend *pend); int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn); int smcd_cdc_msg_send(struct smc_connection *conn); -int smcr_cdc_msg_send_validation(struct smc_connection *conn); +int smcr_cdc_msg_send_validation(struct smc_connection *conn, + struct smc_cdc_tx_pend *pend, + struct smc_wr_buf *wr_buf); int smc_cdc_init(void) __init; void smcd_cdc_rx_init(struct smc_connection *conn); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 65de700e1f17..7964a21e5e6f 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -483,7 +483,8 @@ static int smc_write_space(struct smc_connection *conn) return space; } -static int smc_switch_cursor(struct smc_sock *smc) +static int smc_switch_cursor(struct smc_sock *smc, struct smc_cdc_tx_pend *pend, + struct smc_wr_buf *wr_buf) { struct smc_connection *conn = &smc->conn; union smc_host_cursor cons, fin; @@ -520,11 +521,14 @@ static int smc_switch_cursor(struct smc_sock *smc) if 
(smc->sk.sk_state != SMC_INIT && smc->sk.sk_state != SMC_CLOSED) { - rc = smcr_cdc_msg_send_validation(conn); + rc = smcr_cdc_msg_send_validation(conn, pend, wr_buf); if (!rc) { schedule_delayed_work(&conn->tx_work, 0); smc->sk.sk_data_ready(&smc->sk); } + } else { + smc_wr_tx_put_slot(conn->lnk, + (struct smc_wr_tx_pend_priv *)pend); } return rc; } @@ -533,7 +537,9 @@ struct smc_link *smc_switch_conns(struct smc_link_group *lgr, struct smc_link *from_lnk, bool is_dev_err) { struct smc_link *to_lnk = NULL; + struct smc_cdc_tx_pend *pend; struct smc_connection *conn; + struct smc_wr_buf *wr_buf; struct smc_sock *smc; struct rb_node *node; int i, rc = 0; @@ -582,10 +588,16 @@ again: } sock_hold(&smc->sk); read_unlock_bh(&lgr->conns_lock); + /* pre-fetch buffer outside of send_lock, might sleep */ + rc = smc_cdc_get_free_slot(conn, to_lnk, &wr_buf, NULL, &pend); + if (rc) { + smcr_link_down_cond_sched(to_lnk); + return NULL; + } /* avoid race with smcr_tx_sndbuf_nonempty() */ spin_lock_bh(&conn->send_lock); conn->lnk = to_lnk; - rc = smc_switch_cursor(smc); + rc = smc_switch_cursor(smc, pend, wr_buf); spin_unlock_bh(&conn->send_lock); sock_put(&smc->sk); if (rc) { -- cgit v1.2.3 From b0c19ed6088ab41dd2a727b60594b7297c15d6ce Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 29 May 2020 14:43:44 +0200 Subject: sch_cake: Take advantage of skb->hash where appropriate While the other fq-based qdiscs take advantage of skb->hash and don't recompute it if it is already set, sch_cake does not. This was a deliberate choice because sch_cake hashes various parts of the packet header to support its advanced flow isolation modes. However, foregoing the use of skb->hash entirely loses a few important benefits: - When skb->hash is set by hardware, a few CPU cycles can be saved by not hashing again in software. - Tunnel encapsulations will generally preserve the value of skb->hash from before the encapsulation, which allows flow-based qdiscs to distinguish between flows even though the outer packet header no longer has flow information. It turns out that we can preserve these desirable properties in many cases, while still supporting the advanced flow isolation properties of sch_cake. This patch does so by reusing the skb->hash value as the flow_hash part of the hashing procedure in cake_hash() only in the following conditions: - If the skb->hash is marked as covering the flow headers (skb->l4_hash is set) AND - NAT header rewriting is either disabled, or did not change any values used for hashing. The latter is important to match local-origin packets such as those of a tunnel endpoint. The immediate motivation for fixing this was the recent patch to WireGuard to preserve the skb->hash on encapsulation. As such, this is also what I tested against; with this patch, added latency under load for competing flows drops from ~8 ms to sub-1ms on an RRUL test over a WireGuard tunnel going through a virtual link shaped to 1Gbps using sch_cake. This matches the results we saw with a similar setup using sch_fq_codel when testing the WireGuard patch. Fixes: 046f6fd5daef ("sched: Add Common Applications Kept Enhanced (cake) qdisc") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S.
Miller --- net/sched/sch_cake.c | 65 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 1496e87cd07b..60f8ae578819 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -584,26 +584,48 @@ static bool cobalt_should_drop(struct cobalt_vars *vars, return drop; } -static void cake_update_flowkeys(struct flow_keys *keys, +static bool cake_update_flowkeys(struct flow_keys *keys, const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) struct nf_conntrack_tuple tuple = {}; - bool rev = !skb->_nfct; + bool rev = !skb->_nfct, upd = false; + __be32 ip; if (tc_skb_protocol(skb) != htons(ETH_P_IP)) - return; + return false; if (!nf_ct_get_tuple_skb(&tuple, skb)) - return; + return false; - keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip; - keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip; + ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip; + if (ip != keys->addrs.v4addrs.src) { + keys->addrs.v4addrs.src = ip; + upd = true; + } + ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip; + if (ip != keys->addrs.v4addrs.dst) { + keys->addrs.v4addrs.dst = ip; + upd = true; + } if (keys->ports.ports) { - keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all; - keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all; + __be16 port; + + port = rev ? tuple.dst.u.all : tuple.src.u.all; + if (port != keys->ports.src) { + keys->ports.src = port; + upd = true; + } + port = rev ? tuple.src.u.all : tuple.dst.u.all; + if (port != keys->ports.dst) { + keys->ports.dst = port; + upd = true; + } } + return upd; +#else + return false; #endif } @@ -624,23 +646,36 @@ static bool cake_ddst(int flow_mode) static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, int flow_mode, u16 flow_override, u16 host_override) { + bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS)); + bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS)); + bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG); u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0; u16 reduced_hash, srchost_idx, dsthost_idx; struct flow_keys keys, host_keys; + bool use_skbhash = skb->l4_hash; if (unlikely(flow_mode == CAKE_FLOW_NONE)) return 0; - /* If both overrides are set we can skip packet dissection entirely */ - if ((flow_override || !(flow_mode & CAKE_FLOW_FLOWS)) && - (host_override || !(flow_mode & CAKE_FLOW_HOSTS))) + /* If both overrides are set, or we can use the SKB hash and nat mode is + * disabled, we can skip packet dissection entirely. If nat mode is + * enabled there's another check below after doing the conntrack lookup.
+ */ + if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts) goto skip_hash; skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); - if (flow_mode & CAKE_FLOW_NAT_FLAG) - cake_update_flowkeys(&keys, skb); + /* Don't use the SKB hash if we change the lookup keys from conntrack */ + if (nat_enabled && cake_update_flowkeys(&keys, skb)) + use_skbhash = false; + + /* If we can still use the SKB hash and don't need the host hash, we can + * skip the rest of the hashing procedure + */ + if (use_skbhash && !hash_hosts) + goto skip_hash; /* flow_hash_from_keys() sorts the addresses by value, so we have * to preserve their order in a separate data structure to treat @@ -679,12 +714,14 @@ static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, /* This *must* be after the above switch, since as a * side-effect it sorts the src and dst addresses. */ - if (flow_mode & CAKE_FLOW_FLOWS) + if (hash_flows && !use_skbhash) flow_hash = flow_hash_from_keys(&keys); skip_hash: if (flow_override) flow_hash = flow_override - 1; + else if (use_skbhash) + flow_hash = skb->hash; if (host_override) { dsthost_hash = host_override - 1; srchost_hash = host_override - 1; -- cgit v1.2.3 From 39884604b11692158ce0c559fc603510b96f8c2e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 29 May 2020 17:49:18 +0200 Subject: mptcp: fix NULL ptr dereference in MP_JOIN error path When token lookup on MP_JOIN 3rd ack fails, the server closes the incoming child socket with a reset. Such a socket has the 'is_mptcp' flag set, but no msk socket associated, due to the failed lookup. While crafting the reset packet, mptcp_established_options_mp() will try to dereference the child's master socket, causing a NULL ptr dereference. This change addresses the issue with an explicit fallback to TCP in this error path. Fixes: 729cd6436f35 ("mptcp: cope better with MP_JOIN failure") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/mptcp/subflow.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index f3c06b8af92d..493b98a0825c 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -413,6 +413,20 @@ static void subflow_ulp_fallback(struct sock *sk, tcp_sk(sk)->is_mptcp = 0; } +static void subflow_drop_ctx(struct sock *ssk) +{ + struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk); + + if (!ctx) + return; + + subflow_ulp_fallback(ssk, ctx); + if (ctx->conn) + sock_put(ctx->conn); + + kfree_rcu(ctx, rcu); +} + static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, @@ -485,10 +499,7 @@ create_child: if (fallback_is_fatal) goto dispose_child; - if (ctx) { - subflow_ulp_fallback(child, ctx); - kfree_rcu(ctx, rcu); - } + subflow_drop_ctx(child); goto out; } @@ -537,6 +548,7 @@ out: return child; dispose_child: + subflow_drop_ctx(child); tcp_rsk(req)->drop_req = true; tcp_send_active_reset(child, GFP_ATOMIC); inet_csk_prepare_for_destroy_sock(child); -- cgit v1.2.3 From 5e9cf0f0a3e98992184442de24253fe1b9c40f2e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 29 May 2020 14:04:27 +0200 Subject: cfg80211: fix 6 GHz frequencies to kHz The updates to change to kHz frequencies and the 6 GHz additions evidently overlapped (or rather, I didn't see it when applying the latter), so the 6 GHz is broken. Fix this.
Fixes: 934f4c7dd3a5 ("cfg80211: express channels with a KHz component") Link: https://lore.kernel.org/r/20200529140425.1bf824f6911b.I4a1174916b8f5965af4366999eb9ffc7a0347470@changeid Signed-off-by: Johannes Berg --- net/wireless/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/util.c b/net/wireless/util.c index df75e58eca5d..5b3b0d1222a2 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -94,7 +94,7 @@ u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) case NL80211_BAND_6GHZ: /* see 802.11ax D4.1 27.3.22.2 */ if (chan <= 253) - return 5940 + chan * 5; + return MHZ_TO_KHZ(5940 + chan * 5); break; case NL80211_BAND_60GHZ: if (chan < 7) -- cgit v1.2.3 From d1a1646c0de7b0083d42a1ada72a3ec243bfcc6d Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Fri, 29 May 2020 11:41:43 +0200 Subject: cfg80211: adapt to new channelization of the 6GHz band The 6GHz band does not have regulatory approval yet, but things are moving forward. However, that has led to a change in the channelization of the 6GHz band which has been accepted in the 11ax specification. While at it, it also adds a missing MHZ_TO_KHZ() conversion for 6GHz channels. This change is primarily thrown in to discuss how to deal with it. I noticed ath11k adding 6G support with old channelization and ditto for iw. It probably involves changes in hostapd as well. Cc: Pradeep Kumar Chitrapu Cc: Jouni Malinen Signed-off-by: Arend van Spriel Link: https://lore.kernel.org/r/edf07cdd-ad15-4012-3afd-d8b961a80b69@broadcom.com Signed-off-by: Johannes Berg --- net/wireless/util.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/util.c b/net/wireless/util.c index 5b3b0d1222a2..a27d4f45fb5f 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -92,9 +92,11 @@ u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) return MHZ_TO_KHZ(5000 + chan * 5); break; case NL80211_BAND_6GHZ: - /* see 802.11ax D4.1 27.3.22.2 */ + /* see 802.11ax D6.1 27.3.23.2 */ + if (chan == 2) + return MHZ_TO_KHZ(5935); if (chan <= 253) - return MHZ_TO_KHZ(5940 + chan * 5); + return MHZ_TO_KHZ(5950 + chan * 5); break; case NL80211_BAND_60GHZ: if (chan < 7) -- cgit v1.2.3 From 0e47901d78f0b91901f845c2fc575ae48d8ed395 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:24 +0200 Subject: nl80211: really allow client-only BIGTK support My previous commit here was wrong; it didn't check the new flag in two necessary places, so things didn't work. Fix that. Fixes: 155d7c733807 ("nl80211: allow client-only BIGTK support") Link: https://lore.kernel.org/r/20200528213443.993f108e96ca.I0086ae42d672379380d04ac5effb2f3d5135731b@changeid Signed-off-by: Johannes Berg --- net/wireless/sme.c | 7 +++++-- net/wireless/util.c | 4 +++- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 3554c0d951f4..15595cf401de 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -5,7 +5,7 @@ * (for nl80211's connect() and wext) * * Copyright 2009 Johannes Berg - * Copyright (C) 2009 Intel Corporation. All rights reserved. + * Copyright (C) 2009, 2020 Intel Corporation. All rights reserved.
* Copyright 2017 Intel Deutschland GmbH */ @@ -1118,7 +1118,10 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, if (wiphy_ext_feature_isset( wdev->wiphy, - NL80211_EXT_FEATURE_BEACON_PROTECTION)) + NL80211_EXT_FEATURE_BEACON_PROTECTION) || + wiphy_ext_feature_isset( + wdev->wiphy, + NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) max_key_idx = 7; for (i = 0; i <= max_key_idx; i++) rdev_del_key(rdev, dev, i, false, NULL); diff --git a/net/wireless/util.c b/net/wireless/util.c index a27d4f45fb5f..4d3b76f94f55 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -242,7 +242,9 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, int max_key_idx = 5; if (wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_BEACON_PROTECTION)) + NL80211_EXT_FEATURE_BEACON_PROTECTION) || + wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) max_key_idx = 7; if (key_idx < 0 || key_idx > max_key_idx) return -EINVAL; -- cgit v1.2.3 From 43e64bf301fd8c54f0082d91c6ffd4de861baf96 Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Thu, 28 May 2020 21:34:29 +0200 Subject: cfg80211: handle 6 GHz capability of new station Handle 6 GHz HE capability while adding new station. It will be used later in mac80211 station processing. Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/1589399105-25472-2-git-send-email-rmanohar@codeaurora.org [handle nl80211_set_station, require WME, remove NL80211_HE_6GHZ_CAPABILITY_LEN] Link: https://lore.kernel.org/r/20200528213443.b6b711fd4312.Ic9b97d57b6c4f2b28d4b2d23d2849d8bc20bd8cc@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 5 +++++ net/wireless/nl80211.c | 18 +++++++++++++++++- 3 files changed, 24 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index a38653358885..da734ea71b5a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1238,6 +1238,7 @@ struct sta_txpwr { * @he_capa_len: the length of the HE capabilities * @airtime_weight: airtime scheduler weight for this station * @txpwr: transmit power for an associated station + * @he_6ghz_capa: HE 6 GHz Band capabilities of station */ struct station_parameters { const u8 *supported_rates; @@ -1270,6 +1271,7 @@ struct station_parameters { u8 he_capa_len; u16 airtime_weight; struct sta_txpwr txpwr; + const struct ieee80211_he_6ghz_capa *he_6ghz_capa; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c14666b75e57..e42ae429383e 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2502,6 +2502,9 @@ enum nl80211_commands { * first channel segment specified in %NL80211_ATTR_CENTER_FREQ1. * @NL80211_ATTR_SCAN_FREQ_KHZ: nested attribute with KHz frequencies * + * @NL80211_ATTR_HE_6GHZ_CAPABILITY: HE 6 GHz Band Capability element (from + * association request when used with NL80211_CMD_NEW_STATION). 
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -2982,6 +2985,8 @@ enum nl80211_attrs {
 	NL80211_ATTR_CENTER_FREQ1_OFFSET,
 	NL80211_ATTR_SCAN_FREQ_KHZ,
 
+	NL80211_ATTR_HE_6GHZ_CAPABILITY,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 22c4d13e28cb..bf8bd8268cb7 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -654,6 +654,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_RECEIVE_MULTICAST] = { .type = NLA_FLAG },
 	[NL80211_ATTR_WIPHY_FREQ_OFFSET] = NLA_POLICY_RANGE(NLA_U32, 0, 999),
 	[NL80211_ATTR_SCAN_FREQ_KHZ] = { .type = NLA_NESTED },
+	[NL80211_ATTR_HE_6GHZ_CAPABILITY] = {
+		.type = NLA_EXACT_LEN,
+		.len = sizeof(struct ieee80211_he_6ghz_capa),
+	},
 };
 
 /* policy for the key attributes */
@@ -5989,6 +5993,10 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
 			nla_get_u8(info->attrs[NL80211_ATTR_OPMODE_NOTIF]);
 	}
 
+	if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY])
+		params.he_6ghz_capa =
+			nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]);
+
 	if (info->attrs[NL80211_ATTR_AIRTIME_WEIGHT])
 		params.airtime_weight =
 			nla_get_u16(info->attrs[NL80211_ATTR_AIRTIME_WEIGHT]);
@@ -6123,6 +6131,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 			return -EINVAL;
 	}
 
+	if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY])
+		params.he_6ghz_capa =
+			nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]);
+
 	if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) {
 		params.opmode_notif_used = true;
 		params.opmode_notif =
@@ -6167,10 +6179,14 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 		params.vht_capa = NULL;
 
 		/* HE requires WME */
-		if (params.he_capa_len)
+		if (params.he_capa_len || params.he_6ghz_capa)
 			return -EINVAL;
 	}
 
+	/* Ensure that HT/VHT capabilities are not set for 6 GHz HE STA */
+	if (params.he_6ghz_capa && (params.ht_capa || params.vht_capa))
+		return -EINVAL;
+
 	/* When you run into this, adjust the code below for the new flag */
 	BUILD_BUG_ON(NL80211_STA_FLAG_MAX != 7);
 
--
cgit v1.2.3

From a6cf28e05f0b3bda6ff0c58100324ac91aec6027 Mon Sep 17 00:00:00 2001
From: Rajkumar Manoharan
Date: Thu, 28 May 2020 21:34:30 +0200
Subject: mac80211: add HE 6 GHz Band Capabilities into parse extension

Handle 6 GHz band capability element parsing for association.
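For orientation, a minimal standalone sketch of the extension-element
matching such a parser performs: the element carries a one-byte
extension ID after the usual EID/length header, and the capability is
only accepted at its exact expected size. The struct layout and the
extension-ID value here are assumptions of the sketch, not kernel
definitions.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

#define EID_EXTENSION		255
#define EID_EXT_HE_6GHZ_CAPA	59	/* assumed value for the sketch */

struct he_6ghz_capa {
	uint16_t capa;			/* little-endian on the air */
} __attribute__((packed));

static const struct he_6ghz_capa *
parse_he_6ghz_capa(const uint8_t *ie, size_t ie_len)
{
	/* ie[0]=EID, ie[1]=length, ie[2]=extension ID, payload follows */
	if (ie_len < 3 || ie[0] != EID_EXTENSION)
		return NULL;
	if (ie[2] != EID_EXT_HE_6GHZ_CAPA)
		return NULL;
	/* the length byte counts the extension ID plus the payload */
	if (ie[1] != 1 + sizeof(struct he_6ghz_capa))
		return NULL;
	return (const struct he_6ghz_capa *)&ie[3];
}

int main(void)
{
	const uint8_t elem[] = { 255, 3, 59, 0x34, 0x12 };

	if (parse_he_6ghz_capa(elem, sizeof(elem)))
		printf("capa ok\n");
	return 0;
}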
Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/1589399105-25472-4-git-send-email-rmanohar@codeaurora.org [some renaming to be in line with previous patches] Link: https://lore.kernel.org/r/20200528213443.a13d7a0b85b0.Ia07584da4fc77aa77c4cc563248d2ce4234ffe5d@changeid Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 1 + net/mac80211/util.c | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index b7935f3d000d..dac016636d12 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1494,6 +1494,7 @@ struct ieee802_11_elems { const struct ieee80211_he_operation *he_operation; const struct ieee80211_he_spr *he_spr; const struct ieee80211_mu_edca_param_set *mu_edca_param_set; + const struct ieee80211_he_6ghz_capa *he_6ghz_capa; const u8 *uora_element; const u8 *mesh_id; const u8 *peering; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 20436c86b9bf..5d2c5ae8aadb 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -936,6 +936,10 @@ static void ieee80211_parse_extension_element(u32 *crc, len >= ieee80211_he_spr_size(data)) elems->he_spr = data; break; + case WLAN_EID_EXT_HE_6GHZ_CAPA: + if (len == sizeof(*elems->he_6ghz_capa)) + elems->he_6ghz_capa = data; + break; } } -- cgit v1.2.3 From 223952177296c34d9c8de9cde33204caffe55725 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:31 +0200 Subject: cfg80211: add and expose HE 6 GHz band capabilities These capabilities cover what would otherwise be transported in HT/VHT capabilities, but only a subset thereof that is actually needed on 6 GHz with HE already present. Expose the capabilities to userspace, drivers are expected to set them as using the 6 GHz band (currently) requires HE capability. Link: https://lore.kernel.org/r/20200528213443.244cd5cb9db8.Icd8c773277a88c837e7e3af1d4d1013cc3b66543@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ include/uapi/linux/nl80211.h | 3 +++ net/wireless/nl80211.c | 9 ++++++++- 3 files changed, 14 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index da734ea71b5a..9b76be3d561a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -354,10 +354,13 @@ struct ieee80211_sta_he_cap { * * @types_mask: interface types mask * @he_cap: holds the HE capabilities + * @he_6ghz_capa: HE 6 GHz capabilities, must be filled in for a + * 6 GHz band channel (and 0 may be valid value). 
*/ struct ieee80211_sband_iftype_data { u16 types_mask; struct ieee80211_sta_he_cap he_cap; + struct ieee80211_he_6ghz_capa he_6ghz_capa; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index e42ae429383e..5b350d032fa3 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3565,6 +3565,8 @@ enum nl80211_mpath_info { * defined in HE capabilities IE * @NL80211_BAND_IFTYPE_ATTR_MAX: highest band HE capability attribute currently * defined + * @NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA: HE 6GHz band capabilities (__le16), + * given for all 6 GHz band channels * @__NL80211_BAND_IFTYPE_ATTR_AFTER_LAST: internal use */ enum nl80211_band_iftype_attr { @@ -3575,6 +3577,7 @@ enum nl80211_band_iftype_attr { NL80211_BAND_IFTYPE_ATTR_HE_CAP_PHY, NL80211_BAND_IFTYPE_ATTR_HE_CAP_MCS_SET, NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE, + NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA, /* keep last */ __NL80211_BAND_IFTYPE_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index bf8bd8268cb7..3a24e6add13e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1562,6 +1562,7 @@ static int nl80211_send_coalesce(struct sk_buff *msg, static int nl80211_send_iftype_data(struct sk_buff *msg, + const struct ieee80211_supported_band *sband, const struct ieee80211_sband_iftype_data *iftdata) { const struct ieee80211_sta_he_cap *he_cap = &iftdata->he_cap; @@ -1585,6 +1586,12 @@ nl80211_send_iftype_data(struct sk_buff *msg, return -ENOBUFS; } + if (sband->band == NL80211_BAND_6GHZ && + nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA, + sizeof(iftdata->he_6ghz_capa), + &iftdata->he_6ghz_capa)) + return -ENOBUFS; + return 0; } @@ -1633,7 +1640,7 @@ static int nl80211_send_band_rateinfo(struct sk_buff *msg, if (!iftdata) return -ENOBUFS; - err = nl80211_send_iftype_data(msg, + err = nl80211_send_iftype_data(msg, sband, &sband->iftype_data[i]); if (err) return err; -- cgit v1.2.3 From 24a2042cb22fdfc7feef0df9622f0d9d71b8ced1 Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Thu, 28 May 2020 21:34:32 +0200 Subject: mac80211: add HE 6 GHz Band Capability element Construct HE 6 GHz band capability element (IEEE 802.11ax/D6.0, 9.4.2.261) for association request and mesh beacon. The 6 GHz capability information is passed by driver through iftypes caps. 
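The element this helper emits is only five bytes on the air. Below is a
standalone re-implementation of the tail of
ieee80211_ie_build_he_6ghz_cap(), with the EID constants assumed for
illustration and put_unaligned_le16() open-coded:

#include <stdint.h>
#include <stdio.h>

#define EID_EXTENSION		255	/* WLAN_EID_EXTENSION */
#define EID_EXT_HE_6GHZ_CAPA	59	/* WLAN_EID_EXT_HE_6GHZ_CAPA (assumed) */

static uint8_t *build_he_6ghz_capa_elem(uint8_t *pos, uint16_t cap)
{
	*pos++ = EID_EXTENSION;
	*pos++ = 1 + sizeof(cap);	/* ext ID byte + 2-byte capability */
	*pos++ = EID_EXT_HE_6GHZ_CAPA;
	*pos++ = cap & 0xff;		/* little-endian, unaligned-safe */
	*pos++ = cap >> 8;
	return pos;
}

int main(void)
{
	uint8_t buf[5];
	uint8_t *end = build_he_6ghz_capa_elem(buf, 0x1234);

	for (uint8_t *p = buf; p < end; p++)
		printf("%02x ", *p);
	printf("\n");
	return 0;
}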
Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/1589399105-25472-7-git-send-email-rmanohar@codeaurora.org [handle SMPS, adjust for previous patches, reserve SKB space properly, change to handle SKB directly] Link: https://lore.kernel.org/r/20200528213443.643aa8101111.I3f9747c1147480f65445f13eda5c4a5ed4e86757@changeid Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 2 ++ net/mac80211/mesh.c | 9 +++++++++ net/mac80211/mesh.h | 2 ++ net/mac80211/mesh_plink.c | 4 +++- net/mac80211/mlme.c | 3 +++ net/mac80211/util.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 65 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index dac016636d12..344ea828e806 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2177,6 +2177,8 @@ u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata, u8 iftype); u8 *ieee80211_ie_build_he_cap(u8 *pos, const struct ieee80211_sta_he_cap *he_cap, u8 *end); +void ieee80211_ie_build_he_6ghz_cap(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); u8 *ieee80211_ie_build_he_oper(u8 *pos); int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef, const struct ieee80211_supported_band *sband, diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 5930d07b1e43..5e8d72bdbb98 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -587,6 +587,13 @@ int mesh_add_he_oper_ie(struct ieee80211_sub_if_data *sdata, return 0; } +int mesh_add_he_6ghz_cap_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + ieee80211_ie_build_he_6ghz_cap(sdata, skb); + return 0; +} + static void ieee80211_mesh_path_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = @@ -766,6 +773,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) 2 + sizeof(struct ieee80211_vht_operation) + ie_len_he_cap + 2 + 1 + sizeof(struct ieee80211_he_operation) + + 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa) + ifmsh->ie_len; bcn = kzalloc(sizeof(*bcn) + head_len + tail_len, GFP_KERNEL); @@ -885,6 +893,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) mesh_add_vht_oper_ie(sdata, skb) || mesh_add_he_cap_ie(sdata, skb, ie_len_he_cap) || mesh_add_he_oper_ie(sdata, skb) || + mesh_add_he_6ghz_cap_ie(sdata, skb) || mesh_add_vendor_ies(sdata, skb)) goto out_free; diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index 953f720754e8..40492d1bd8fd 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -222,6 +222,8 @@ int mesh_add_he_cap_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u8 ie_len); int mesh_add_he_oper_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); +int mesh_add_he_6ghz_cap_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); void mesh_rmc_free(struct ieee80211_sub_if_data *sdata); int mesh_rmc_init(struct ieee80211_sub_if_data *sdata); void ieee80211s_init(void); diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 737c5f4dbf52..3aca89c97f36 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -238,6 +238,7 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, 2 + sizeof(struct ieee80211_vht_operation) + ie_len_he_cap + 2 + 1 + sizeof(struct ieee80211_he_operation) + + 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa) + 2 + 8 + /* peering IE */ sdata->u.mesh.ie_len); if (!skb) @@ -328,7 +329,8 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, mesh_add_vht_cap_ie(sdata, 
skb) || mesh_add_vht_oper_ie(sdata, skb) || mesh_add_he_cap_ie(sdata, skb, ie_len_he_cap) || - mesh_add_he_oper_ie(sdata, skb)) + mesh_add_he_oper_ie(sdata, skb) || + mesh_add_he_6ghz_cap_ie(sdata, skb)) goto free; } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index a259b4487b60..f6ddce646f18 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -658,6 +658,8 @@ static void ieee80211_add_he_ie(struct ieee80211_sub_if_data *sdata, he_cap->he_cap_elem.phy_cap_info); pos = skb_put(skb, he_cap_size); ieee80211_ie_build_he_cap(pos, he_cap, pos + he_cap_size); + + ieee80211_ie_build_he_6ghz_cap(sdata, skb); } static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) @@ -731,6 +733,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) 2 + 1 + sizeof(struct ieee80211_he_cap_elem) + /* HE */ sizeof(struct ieee80211_he_mcs_nss_supp) + IEEE80211_HE_PPE_THRES_MAX_LEN + + 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa) + assoc_data->ie_len + /* extra IEs */ (assoc_data->fils_kek_len ? 16 /* AES-SIV */ : 0) + 9, /* WMM */ diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 5d2c5ae8aadb..048b38546a56 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -2839,6 +2839,52 @@ end: return pos; } +void ieee80211_ie_build_he_6ghz_cap(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_supported_band *sband; + const struct ieee80211_sband_iftype_data *iftd; + enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); + u8 *pos; + u16 cap; + + sband = ieee80211_get_sband(sdata); + if (!sband) + return; + + iftd = ieee80211_get_sband_iftype_data(sband, iftype); + if (WARN_ON(!iftd)) + return; + + cap = le16_to_cpu(iftd->he_6ghz_capa.capa); + cap &= ~IEEE80211_HE_6GHZ_CAP_SM_PS; + + switch (sdata->smps_mode) { + case IEEE80211_SMPS_AUTOMATIC: + case IEEE80211_SMPS_NUM_MODES: + WARN_ON(1); + /* fall through */ + case IEEE80211_SMPS_OFF: + cap |= u16_encode_bits(WLAN_HT_CAP_SM_PS_DISABLED, + IEEE80211_HE_6GHZ_CAP_SM_PS); + break; + case IEEE80211_SMPS_STATIC: + cap |= u16_encode_bits(WLAN_HT_CAP_SM_PS_STATIC, + IEEE80211_HE_6GHZ_CAP_SM_PS); + break; + case IEEE80211_SMPS_DYNAMIC: + cap |= u16_encode_bits(WLAN_HT_CAP_SM_PS_DYNAMIC, + IEEE80211_HE_6GHZ_CAP_SM_PS); + break; + } + + pos = skb_put(skb, 2 + 1 + sizeof(cap)); + *pos++ = WLAN_EID_EXTENSION; + *pos++ = 1 + sizeof(cap); + *pos++ = WLAN_EID_EXT_HE_6GHZ_CAPA; + put_unaligned_le16(cap, pos); +} + u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, const struct cfg80211_chan_def *chandef, u16 prot_mode, bool rifs_mode) -- cgit v1.2.3 From d1b7524b3ea140e552658485d4e9dce5ee2953e1 Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Thu, 28 May 2020 21:34:33 +0200 Subject: mac80211: build HE operation with 6 GHz oper information Add 6 GHz operation information (IEEE 802.11ax/D6.0, Figure 9-787k) while building HE operation element for non-HE AP. This field is used to determine channel information in the absence of HT/VHT IEs. 
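The 160 MHz interop workaround in this patch is easiest to follow with
numbers: ccfs1 keeps the 160 MHz block center while ccfs0 is shifted by
8 channel units (40 MHz) to the 80 MHz segment holding the primary
channel. A standalone sketch, with the struct and helper names invented
for illustration:

#include <stdint.h>
#include <stdio.h>

struct chandef_sketch {
	int primary_mhz;	/* primary channel center frequency */
	int center1_mhz;	/* center of the full 160 MHz block */
};

/* Post-D6.1 6 GHz channelization: channel N sits at 5950 + N * 5 MHz */
static uint8_t freq_to_chan_6ghz(int mhz)
{
	return (mhz - 5950) / 5;
}

static void fill_160mhz(const struct chandef_sketch *c,
			uint8_t *ccfs0, uint8_t *ccfs1)
{
	*ccfs0 = freq_to_chan_6ghz(c->center1_mhz);
	*ccfs1 = *ccfs0;		/* ccfs1 keeps the 160 MHz center */
	if (c->primary_mhz < c->center1_mhz)
		*ccfs0 -= 8;		/* lower 80 MHz segment (-40 MHz) */
	else
		*ccfs0 += 8;		/* upper 80 MHz segment (+40 MHz) */
}

int main(void)
{
	struct chandef_sketch c = { .primary_mhz = 5955, .center1_mhz = 6025 };
	uint8_t ccfs0, ccfs1;

	fill_160mhz(&c, &ccfs0, &ccfs1);
	printf("ccfs0=%u ccfs1=%u\n", ccfs0, ccfs1);	/* 7 and 15 */
	return 0;
}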
Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/1589399105-25472-8-git-send-email-rmanohar@codeaurora.org [fix skb allocation size] Link: https://lore.kernel.org/r/20200528193455.76796-1-johannes@sipsolutions.net Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/mesh.c | 12 ++++++--- net/mac80211/mesh_plink.c | 1 + net/mac80211/util.c | 65 +++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 72 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 344ea828e806..9f874ce500f6 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2179,7 +2179,7 @@ u8 *ieee80211_ie_build_he_cap(u8 *pos, u8 *end); void ieee80211_ie_build_he_6ghz_cap(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); -u8 *ieee80211_ie_build_he_oper(u8 *pos); +u8 *ieee80211_ie_build_he_oper(u8 *pos, struct cfg80211_chan_def *chandef); int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef, const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates); diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 5e8d72bdbb98..5f3d45474db6 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -565,6 +565,7 @@ int mesh_add_he_oper_ie(struct ieee80211_sub_if_data *sdata, { const struct ieee80211_sta_he_cap *he_cap; struct ieee80211_supported_band *sband; + u32 len; u8 *pos; sband = ieee80211_get_sband(sdata); @@ -578,11 +579,15 @@ int mesh_add_he_oper_ie(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10) return 0; - if (skb_tailroom(skb) < 2 + 1 + sizeof(struct ieee80211_he_operation)) + len = 2 + 1 + sizeof(struct ieee80211_he_operation); + if (sdata->vif.bss_conf.chandef.chan->band == NL80211_BAND_6GHZ) + len += sizeof(struct ieee80211_he_6ghz_oper); + + if (skb_tailroom(skb) < len) return -ENOMEM; - pos = skb_put(skb, 2 + 1 + sizeof(struct ieee80211_he_operation)); - ieee80211_ie_build_he_oper(pos); + pos = skb_put(skb, len); + ieee80211_ie_build_he_oper(pos, &sdata->vif.bss_conf.chandef); return 0; } @@ -773,6 +778,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) 2 + sizeof(struct ieee80211_vht_operation) + ie_len_he_cap + 2 + 1 + sizeof(struct ieee80211_he_operation) + + sizeof(struct ieee80211_he_6ghz_oper) + 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa) + ifmsh->ie_len; diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 3aca89c97f36..fbbfc5d4a51c 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -238,6 +238,7 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, 2 + sizeof(struct ieee80211_vht_operation) + ie_len_he_cap + 2 + 1 + sizeof(struct ieee80211_he_operation) + + sizeof(struct ieee80211_he_6ghz_oper) + 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa) + 2 + 8 + /* peering IE */ sdata->u.mesh.ie_len); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 048b38546a56..87dd003dbdf2 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3008,13 +3008,18 @@ u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, return pos + sizeof(struct ieee80211_vht_operation); } -u8 *ieee80211_ie_build_he_oper(u8 *pos) +u8 *ieee80211_ie_build_he_oper(u8 *pos, struct cfg80211_chan_def *chandef) { struct ieee80211_he_operation *he_oper; + struct ieee80211_he_6ghz_oper *he_6ghz_op; u32 he_oper_params; + u8 ie_len = 1 + sizeof(struct ieee80211_he_operation); + + if 
(chandef->chan->band == NL80211_BAND_6GHZ) + ie_len += sizeof(struct ieee80211_he_6ghz_oper); *pos++ = WLAN_EID_EXTENSION; - *pos++ = 1 + sizeof(struct ieee80211_he_operation); + *pos++ = ie_len; *pos++ = WLAN_EID_EXT_HE_OPERATION; he_oper_params = 0; @@ -3024,16 +3029,68 @@ u8 *ieee80211_ie_build_he_oper(u8 *pos) IEEE80211_HE_OPERATION_ER_SU_DISABLE); he_oper_params |= u32_encode_bits(1, IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED); + if (chandef->chan->band == NL80211_BAND_6GHZ) + he_oper_params |= u32_encode_bits(1, + IEEE80211_HE_OPERATION_6GHZ_OP_INFO); he_oper = (struct ieee80211_he_operation *)pos; he_oper->he_oper_params = cpu_to_le32(he_oper_params); /* don't require special HE peer rates */ he_oper->he_mcs_nss_set = cpu_to_le16(0xffff); + pos += sizeof(struct ieee80211_he_operation); - /* TODO add VHT operational and 6GHz operational subelement? */ + if (chandef->chan->band != NL80211_BAND_6GHZ) + goto out; - return pos + sizeof(struct ieee80211_vht_operation); + /* TODO add VHT operational */ + he_6ghz_op = (struct ieee80211_he_6ghz_oper *)pos; + he_6ghz_op->minrate = 6; /* 6 Mbps */ + he_6ghz_op->primary = + ieee80211_frequency_to_channel(chandef->chan->center_freq); + he_6ghz_op->ccfs0 = + ieee80211_frequency_to_channel(chandef->center_freq1); + if (chandef->center_freq2) + he_6ghz_op->ccfs1 = + ieee80211_frequency_to_channel(chandef->center_freq2); + else + he_6ghz_op->ccfs1 = 0; + + switch (chandef->width) { + case NL80211_CHAN_WIDTH_160: + /* Convert 160 MHz channel width to new style as interop + * workaround. + */ + he_6ghz_op->control = + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ; + he_6ghz_op->ccfs1 = he_6ghz_op->ccfs0; + if (chandef->chan->center_freq < chandef->center_freq1) + he_6ghz_op->ccfs0 -= 8; + else + he_6ghz_op->ccfs0 += 8; + fallthrough; + case NL80211_CHAN_WIDTH_80P80: + he_6ghz_op->control = + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ; + break; + case NL80211_CHAN_WIDTH_80: + he_6ghz_op->control = + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ; + break; + case NL80211_CHAN_WIDTH_40: + he_6ghz_op->control = + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ; + break; + default: + he_6ghz_op->control = + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ; + break; + } + + pos += sizeof(struct ieee80211_he_6ghz_oper); + +out: + return pos; } bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, -- cgit v1.2.3 From 607ca9ea3462719e256b60b24286f984e0d48c9b Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Thu, 28 May 2020 21:34:34 +0200 Subject: mac80211: do not allow HT/VHT IEs in 6 GHz mesh mode As HT/VHT elements are not allowed in 6 GHz band, do not include them in mesh beacon template formation. 
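Note the guard pattern the next patch adds to each mesh IE builder: it
returns 0 (success) rather than an error, so the existing
"mesh_add_a(...) || mesh_add_b(...) || ..." call chains keep working and
the element is simply omitted on 6 GHz. A minimal sketch of the
pattern, with names invented for illustration:

#include <stdio.h>

enum band_sketch { BAND_2GHZ, BAND_5GHZ, BAND_6GHZ };

static int mesh_add_ht_cap_sketch(enum band_sketch band)
{
	if (band == BAND_6GHZ)
		return 0;	/* HT not allowed in 6 GHz: skip the IE */

	/* ... otherwise build and append the HT capability element ... */
	return 0;
}

int main(void)
{
	printf("%d\n", mesh_add_ht_cap_sketch(BAND_6GHZ));
	return 0;
}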
Signed-off-by: Rajkumar Manoharan
Link: https://lore.kernel.org/r/1589399105-25472-9-git-send-email-rmanohar@codeaurora.org
Link: https://lore.kernel.org/r/20200528193455.76796-2-johannes@sipsolutions.net
Signed-off-by: Johannes Berg
---
 net/mac80211/mesh.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 5f3d45474db6..79e0a90982dd 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -415,6 +415,10 @@ int mesh_add_ht_cap_ie(struct ieee80211_sub_if_data *sdata,
 	if (!sband)
 		return -EINVAL;
 
+	/* HT not allowed in 6 GHz */
+	if (sband->band == NL80211_BAND_6GHZ)
+		return 0;
+
 	if (!sband->ht_cap.ht_supported ||
 	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
 	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 ||
@@ -452,6 +456,10 @@ int mesh_add_ht_oper_ie(struct ieee80211_sub_if_data *sdata,
 	sband = local->hw.wiphy->bands[channel->band];
 	ht_cap = &sband->ht_cap;
 
+	/* HT not allowed in 6 GHz */
+	if (sband->band == NL80211_BAND_6GHZ)
+		return 0;
+
 	if (!ht_cap->ht_supported ||
 	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
 	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 ||
@@ -479,6 +487,10 @@ int mesh_add_vht_cap_ie(struct ieee80211_sub_if_data *sdata,
 	if (!sband)
 		return -EINVAL;
 
+	/* VHT not allowed in 6 GHz */
+	if (sband->band == NL80211_BAND_6GHZ)
+		return 0;
+
 	if (!sband->vht_cap.vht_supported ||
 	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
 	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 ||
@@ -516,6 +528,10 @@ int mesh_add_vht_oper_ie(struct ieee80211_sub_if_data *sdata,
 	sband = local->hw.wiphy->bands[channel->band];
 	vht_cap = &sband->vht_cap;
 
+	/* VHT not allowed in 6 GHz */
+	if (sband->band == NL80211_BAND_6GHZ)
+		return 0;
+
 	if (!vht_cap->vht_supported ||
 	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
 	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 ||
--
cgit v1.2.3

From 2a333a0db24e37daa2e4eb9a542c07deda44ca5a Mon Sep 17 00:00:00 2001
From: Johannes Berg
Date: Thu, 28 May 2020 21:34:35 +0200
Subject: mac80211: avoid using ext NSS high BW if not supported

If the AP advertises inconsistent data, namely it has CCFS1 or CCFS2,
but doesn't advertise support for 160/80+80 bandwidth or "Extended NSS
BW Support", then we cannot use any MCSes in the higher bandwidth.
Thus, avoid connecting with higher bandwidth since it's less efficient
that way.
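The patch's switch over (supp_chwidth << 4) | ext_nss_bw_supp can be
exercised standalone. The sketch below models only the CCFS1 selection
per IEEE 802.11 Table 9-250 and omits the support_160/support_80_80
bookkeeping the kernel code also does:

#include <stdint.h>
#include <stdio.h>

static uint8_t pick_ccf1(uint8_t supp_chwidth, uint8_t ext_nss_bw_supp,
			 uint8_t ccfs1, uint8_t ccfs2)
{
	switch ((supp_chwidth << 4) | ext_nss_bw_supp) {
	default:
	case 0x00:
		return 0;	/* peer can't do 160/80+80 at all */
	case 0x01:
	case 0x02:
	case 0x03:
		return ccfs2;	/* new-style signalling via CCFS2 */
	case 0x10:
		return ccfs1;
	case 0x11:
	case 0x12:
		return ccfs1 ? ccfs1 : ccfs2;
	case 0x13:
	case 0x20:
	case 0x23:
		return ccfs1;
	}
}

int main(void)
{
	/* supp_chwidth=0, ext_nss_bw_supp=0: ignore CCFS1/CCFS2 */
	printf("%u\n", pick_ccf1(0, 0, 50, 114));
	return 0;
}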
Link: https://lore.kernel.org/r/20200528213443.0e55d40c3ccc.I6fd0b4708ebd087e5e46466c3e91f6efbcbef668@changeid Signed-off-by: Johannes Berg --- net/mac80211/ibss.c | 11 +++++++++-- net/mac80211/ieee80211_i.h | 6 +++++- net/mac80211/mesh.c | 16 ++++++++++++--- net/mac80211/mlme.c | 25 +++++++++++++++++------ net/mac80211/scan.c | 6 ++++++ net/mac80211/spectmgmt.c | 4 +++- net/mac80211/util.c | 49 ++++++++++++++++++++++++++++++++++++++++++---- 7 files changed, 100 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 2479cd48fed0..81d26fef41e9 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -9,7 +9,7 @@ * Copyright 2009, Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2016 Intel Deutschland GmbH - * Copyright(c) 2018-2019 Intel Corporation + * Copyright(c) 2018-2020 Intel Corporation */ #include @@ -781,6 +781,7 @@ ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata, enum nl80211_channel_type ch_type; int err; u32 sta_flags; + u32 vht_cap_info = 0; sdata_assert_lock(sdata); @@ -798,9 +799,13 @@ ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata, break; } + if (elems->vht_cap_elem) + vht_cap_info = le32_to_cpu(elems->vht_cap_elem->vht_cap_info); + memset(¶ms, 0, sizeof(params)); err = ieee80211_parse_ch_switch_ie(sdata, elems, ifibss->chandef.chan->band, + vht_cap_info, sta_flags, ifibss->bssid, &csa_ie); /* can't switch to destination channel, fail */ if (err < 0) @@ -1060,8 +1065,10 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, /* we both use VHT */ struct ieee80211_vht_cap cap_ie; struct ieee80211_sta_vht_cap cap = sta->sta.vht_cap; + u32 vht_cap_info = + le32_to_cpu(elems->vht_cap_elem->vht_cap_info); - ieee80211_chandef_vht_oper(&local->hw, + ieee80211_chandef_vht_oper(&local->hw, vht_cap_info, elems->vht_operation, elems->ht_operation, &chandef); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 9f874ce500f6..0cc584574976 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -111,6 +111,8 @@ struct ieee80211_bss { size_t supp_rates_len; struct ieee80211_rate *beacon_rate; + u32 vht_cap_info; + /* * During association, we save an ERP value from a probe response so * that we can feed ERP info to the driver when handling the @@ -1915,6 +1917,7 @@ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, * @sdata: the sdata of the interface which has received the frame * @elems: parsed 802.11 elements received with the frame * @current_band: indicates the current band + * @vht_cap_info: VHT capabilities of the transmitter * @sta_flags: contains information about own capabilities and restrictions * to decide which channel switch announcements can be accepted. 
Only the * following subset of &enum ieee80211_sta_flags are evaluated: @@ -1929,6 +1932,7 @@ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band current_band, + u32 vht_cap_info, u32 sta_flags, u8 *bssid, struct ieee80211_csa_ie *csa_ie); @@ -2194,7 +2198,7 @@ u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo); /* channel management */ bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, struct cfg80211_chan_def *chandef); -bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, +bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info, const struct ieee80211_vht_operation *oper, const struct ieee80211_ht_operation *htop, struct cfg80211_chan_def *chandef); diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 79e0a90982dd..696d6fb322e6 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, 2009 open80211s Ltd. - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2020 Intel Corporation * Authors: Luis Carlos Cobo * Javier Cardona */ @@ -63,6 +63,7 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata, u32 basic_rates = 0; struct cfg80211_chan_def sta_chan_def; struct ieee80211_supported_band *sband; + u32 vht_cap_info = 0; /* * As support for each feature is added, check for matching @@ -96,7 +97,11 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata, cfg80211_chandef_create(&sta_chan_def, sdata->vif.bss_conf.chandef.chan, NL80211_CHAN_NO_HT); ieee80211_chandef_ht_oper(ie->ht_operation, &sta_chan_def); - ieee80211_chandef_vht_oper(&sdata->local->hw, + + if (ie->vht_cap_elem) + vht_cap_info = le32_to_cpu(ie->vht_cap_elem->vht_cap_info); + + ieee80211_chandef_vht_oper(&sdata->local->hw, vht_cap_info, ie->vht_operation, ie->ht_operation, &sta_chan_def); @@ -1076,7 +1081,7 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct ieee80211_supported_band *sband; int err; - u32 sta_flags; + u32 sta_flags, vht_cap_info = 0; sdata_assert_lock(sdata); @@ -1099,8 +1104,13 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata, break; } + if (elems->vht_cap_elem) + vht_cap_info = + le32_to_cpu(elems->vht_cap_elem->vht_cap_info); + memset(¶ms, 0, sizeof(params)); err = ieee80211_parse_ch_switch_ie(sdata, elems, sband->band, + vht_cap_info, sta_flags, sdata->vif.addr, &csa_ie); if (err < 0) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index f6ddce646f18..1f9c48414c85 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -145,6 +145,7 @@ static u32 ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, struct ieee80211_channel *channel, + u32 vht_cap_info, const struct ieee80211_ht_operation *ht_oper, const struct ieee80211_vht_operation *vht_oper, const struct ieee80211_he_operation *he_oper, @@ -223,7 +224,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, memcpy(&he_oper_vht_cap, he_oper->optional, 3); he_oper_vht_cap.basic_mcs_set = cpu_to_le16(0); - if (!ieee80211_chandef_vht_oper(&sdata->local->hw, + if (!ieee80211_chandef_vht_oper(&sdata->local->hw, vht_cap_info, &he_oper_vht_cap, ht_oper, &vht_chandef)) { if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) @@ -232,8 +233,10 @@ 
ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, ret = IEEE80211_STA_DISABLE_HE; goto out; } - } else if (!ieee80211_chandef_vht_oper(&sdata->local->hw, vht_oper, - ht_oper, &vht_chandef)) { + } else if (!ieee80211_chandef_vht_oper(&sdata->local->hw, + vht_cap_info, + vht_oper, ht_oper, + &vht_chandef)) { if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) sdata_info(sdata, "AP VHT information is invalid, disable VHT\n"); @@ -329,6 +332,7 @@ out: static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, const struct ieee80211_ht_cap *ht_cap, + const struct ieee80211_vht_cap *vht_cap, const struct ieee80211_ht_operation *ht_oper, const struct ieee80211_vht_operation *vht_oper, const struct ieee80211_he_operation *he_oper, @@ -343,6 +347,7 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, u16 ht_opmode; u32 flags; enum ieee80211_sta_rx_bandwidth new_sta_bw; + u32 vht_cap_info = 0; int ret; /* if HT was/is disabled, don't track any bandwidth changes */ @@ -371,8 +376,11 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.ht_operation_mode = ht_opmode; } + if (vht_cap) + vht_cap_info = le32_to_cpu(vht_cap->vht_cap_info); + /* calculate new channel (type) based on HT/VHT/HE operation IEs */ - flags = ieee80211_determine_chantype(sdata, sband, chan, + flags = ieee80211_determine_chantype(sdata, sband, chan, vht_cap_info, ht_oper, vht_oper, he_oper, &chandef, true); @@ -1327,6 +1335,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, enum nl80211_band current_band; struct ieee80211_csa_ie csa_ie; struct ieee80211_channel_switch ch_switch; + struct ieee80211_bss *bss; int res; sdata_assert_lock(sdata); @@ -1338,7 +1347,9 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, return; current_band = cbss->channel->band; + bss = (void *)cbss->priv; res = ieee80211_parse_ch_switch_ie(sdata, elems, current_band, + bss->vht_cap_info, ifmgd->flags, ifmgd->associated->bssid, &csa_ie); @@ -4097,8 +4108,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, changed |= ieee80211_recalc_twt_req(sdata, sta, &elems); - if (ieee80211_config_bw(sdata, sta, - elems.ht_cap_elem, elems.ht_operation, + if (ieee80211_config_bw(sdata, sta, elems.ht_cap_elem, + elems.vht_cap_elem, elems.ht_operation, elems.vht_operation, elems.he_operation, bssid, &changed)) { mutex_unlock(&local->sta_mtx); @@ -4815,6 +4826,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, const struct ieee80211_he_operation *he_oper = NULL; struct ieee80211_supported_band *sband; struct cfg80211_chan_def chandef; + struct ieee80211_bss *bss = (void *)cbss->priv; int ret; u32 i; bool have_80mhz; @@ -4913,6 +4925,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, ifmgd->flags |= ieee80211_determine_chantype(sdata, sband, cbss->channel, + bss->vht_cap_info, ht_oper, vht_oper, he_oper, &chandef, false); diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 5db15996524f..d0c2e8012118 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -132,6 +132,12 @@ ieee80211_update_bss_from_elems(struct ieee80211_local *local, bss->beacon_rate = &sband->bitrates[rx_status->rate_idx]; } + + if (elems->vht_cap_elem) + bss->vht_cap_info = + le32_to_cpu(elems->vht_cap_elem->vht_cap_info); + else + bss->vht_cap_info = 0; } struct ieee80211_bss * diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c index 
5fe2b645912f..ae1cb2c68722 100644 --- a/net/mac80211/spectmgmt.c +++ b/net/mac80211/spectmgmt.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu * Copyright 2007-2008, Intel Corporation * Copyright 2008, Johannes Berg - * Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018, 2020 Intel Corporation */ #include @@ -22,6 +22,7 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band current_band, + u32 vht_cap_info, u32 sta_flags, u8 *bssid, struct ieee80211_csa_ie *csa_ie) { @@ -150,6 +151,7 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, /* ignore if parsing fails */ if (!ieee80211_chandef_vht_oper(&sdata->local->hw, + vht_cap_info, &vht_oper, &ht_oper, &new_vht_chandef)) new_vht_chandef.chan = NULL; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 87dd003dbdf2..e6104829fa1c 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3120,7 +3120,7 @@ bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, return true; } -bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, +bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info, const struct ieee80211_vht_operation *oper, const struct ieee80211_ht_operation *htop, struct cfg80211_chan_def *chandef) @@ -3132,6 +3132,10 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap; bool support_80_80 = false; bool support_160 = false; + u8 ext_nss_bw_supp = u32_get_bits(vht_cap_info, + IEEE80211_VHT_CAP_EXT_NSS_BW_MASK); + u8 supp_chwidth = u32_get_bits(vht_cap_info, + IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK); if (!oper || !htop) return false; @@ -3151,11 +3155,48 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, IEEE80211_HT_OP_MODE_CCFS2_MASK) >> IEEE80211_HT_OP_MODE_CCFS2_SHIFT; - /* when parsing (and we know how to) CCFS1 and CCFS2 are equivalent */ ccf0 = ccfs0; - ccf1 = ccfs1; - if (!ccfs1 && ieee80211_hw_check(hw, SUPPORTS_VHT_EXT_NSS_BW)) + + /* if not supported, parse as though we didn't understand it */ + if (!ieee80211_hw_check(hw, SUPPORTS_VHT_EXT_NSS_BW)) + ext_nss_bw_supp = 0; + + /* + * Cf. IEEE 802.11 Table 9-250 + * + * We really just consider that because it's inefficient to connect + * at a higher bandwidth than we'll actually be able to use. + */ + switch ((supp_chwidth << 4) | ext_nss_bw_supp) { + default: + case 0x00: + ccf1 = 0; + support_160 = false; + support_80_80 = false; + break; + case 0x01: + support_80_80 = false; + /* fall through */ + case 0x02: + case 0x03: ccf1 = ccfs2; + break; + case 0x10: + ccf1 = ccfs1; + break; + case 0x11: + case 0x12: + if (!ccfs1) + ccf1 = ccfs2; + else + ccf1 = ccfs1; + break; + case 0x13: + case 0x20: + case 0x23: + ccf1 = ccfs1; + break; + } cf0 = ieee80211_channel_to_frequency(ccf0, chandef->chan->band); cf1 = ieee80211_channel_to_frequency(ccf1, chandef->chan->band); -- cgit v1.2.3 From 57fa5e85d53ce51e0cb06a7f320b79377d0fbe5f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:36 +0200 Subject: mac80211: determine chandef from HE 6 GHz operation Support connecting to HE 6 GHz APs and mesh networks on 6 GHz, where the HT/VHT information is missing but instead the HE 6 GHz band capability is present, and the 6 GHz Operation information field is used to encode the channel configuration instead of the HT/VHT operation elements. Also add some other bits needed to connect to 6 GHz networks. 
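The width decision for the 6 GHz Operation Information field is worth
spelling out: the control bits alone cannot distinguish 160 MHz from
80+80 MHz, so the distance between the two center-frequency segments
disambiguates (8 channel units, i.e. 40 MHz apart, means one contiguous
160 MHz block). A standalone sketch of that logic, with names invented
for illustration:

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>

enum width_sketch { W20, W40, W80, W160, W80P80 };

static enum width_sketch he_6ghz_oper_width(unsigned int ctrl_width,
					    uint8_t ccfs0, uint8_t ccfs1,
					    int support_160, int support_80_80)
{
	switch (ctrl_width) {
	case 0:
		return W20;
	case 1:
		return W40;
	case 2:
		return W80;
	case 3:
		if (!ccfs1)
			return W80;	/* no second segment given */
		if (abs(ccfs1 - ccfs0) == 8)	/* 40 MHz apart: contiguous */
			return support_160 ? W160 : W80;
		return support_80_80 ? W80P80 : W80;
	}
	return W20;
}

int main(void)
{
	/* segments eight channel units apart -> contiguous 160 MHz */
	printf("%d\n", he_6ghz_oper_width(3, 7, 15, 1, 1));	/* W160 */
	return 0;
}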
Link: https://lore.kernel.org/r/1589399105-25472-10-git-send-email-rmanohar@codeaurora.org Co-developed-by: Rajkumar Manoharan Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/20200528213443.25687d2695bc.I3f9747c1147480f65445f13eda5c4a5ed4e86757@changeid Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 3 ++ net/mac80211/mesh.c | 1 + net/mac80211/mlme.c | 69 +++++++++++++++++++++-------- net/mac80211/util.c | 106 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 160 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 0cc584574976..6cac5bf7cba3 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2202,6 +2202,9 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info, const struct ieee80211_vht_operation *oper, const struct ieee80211_ht_operation *htop, struct cfg80211_chan_def *chandef); +bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata, + const struct ieee80211_he_operation *he_oper, + struct cfg80211_chan_def *chandef); u32 ieee80211_chandef_downgrade(struct cfg80211_chan_def *c); int __must_check diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 696d6fb322e6..5f1ca25b6c97 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -104,6 +104,7 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata, ieee80211_chandef_vht_oper(&sdata->local->hw, vht_cap_info, ie->vht_operation, ie->ht_operation, &sta_chan_def); + ieee80211_chandef_he_6ghz_oper(sdata, ie->he_operation, &sta_chan_def); if (!cfg80211_chandef_compatible(&sdata->vif.bss_conf.chandef, &sta_chan_def)) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 1f9c48414c85..bc558d1d20fc 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -156,15 +156,24 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_ht_cap sta_ht_cap; u32 ht_cfreq, ret; - memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap)); - ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); - memset(chandef, 0, sizeof(struct cfg80211_chan_def)); chandef->chan = channel; chandef->width = NL80211_CHAN_WIDTH_20_NOHT; chandef->center_freq1 = channel->center_freq; chandef->freq1_offset = channel->freq_offset; + if (channel->band == NL80211_BAND_6GHZ) { + if (!ieee80211_chandef_he_6ghz_oper(sdata, he_oper, chandef)) + ret = IEEE80211_STA_DISABLE_HT | + IEEE80211_STA_DISABLE_VHT | + IEEE80211_STA_DISABLE_HE; + vht_chandef = *chandef; + goto out; + } + + memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap)); + ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); + if (!ht_oper || !sta_ht_cap.ht_supported) { ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT | @@ -914,7 +923,8 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT))) ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) + if (sband->band != NL80211_BAND_6GHZ && + !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) ieee80211_add_ht_ie(sdata, skb, assoc_data->ap_ht_param, sband, chan, sdata->smps_mode); @@ -968,7 +978,8 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) offset = noffset; } - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) + if (sband->band != NL80211_BAND_6GHZ && + !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) ieee80211_add_vht_ie(sdata, skb, sband, &assoc_data->ap_vht_cap); @@ -3248,6 +3259,7 @@ static bool 
ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; const struct cfg80211_bss_ies *bss_ies = NULL; struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data; + bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ; u32 changed = 0; int err; bool ret; @@ -3289,11 +3301,12 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, * 2G/3G/4G wifi routers, reported models include the "Onda PN51T", * "Vodafone PocketWiFi 2", "ZTE MF60" and a similar T-Mobile device. */ - if ((assoc_data->wmm && !elems->wmm_param) || - (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && - (!elems->ht_cap_elem || !elems->ht_operation)) || - (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && - (!elems->vht_cap_elem || !elems->vht_operation))) { + if (!is_6ghz && + ((assoc_data->wmm && !elems->wmm_param) || + (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && + (!elems->ht_cap_elem || !elems->ht_operation)) || + (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && + (!elems->vht_cap_elem || !elems->vht_operation)))) { const struct cfg80211_bss_ies *ies; struct ieee802_11_elems bss_elems; @@ -3351,7 +3364,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, * We previously checked these in the beacon/probe response, so * they should be present here. This is just a safety net. */ - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && + if (!is_6ghz && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && (!elems->wmm_param || !elems->ht_cap_elem || !elems->ht_operation)) { sdata_info(sdata, "HT AP is missing WMM params or HT capability/operation\n"); @@ -3359,7 +3372,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, goto out; } - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && + if (!is_6ghz && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && (!elems->vht_cap_elem || !elems->vht_operation)) { sdata_info(sdata, "VHT AP is missing VHT capability/operation\n"); @@ -3367,6 +3380,14 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, goto out; } + if (is_6ghz && !(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && + !elems->he_6ghz_capa) { + sdata_info(sdata, + "HE 6 GHz AP is missing HE 6 GHz band capability\n"); + ret = false; + goto out; + } + mutex_lock(&sdata->local->sta_mtx); /* * station info was already allocated and inserted before @@ -4826,6 +4847,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, const struct ieee80211_he_operation *he_oper = NULL; struct ieee80211_supported_band *sband; struct cfg80211_chan_def chandef; + bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ; struct ieee80211_bss *bss = (void *)cbss->priv; int ret; u32 i; @@ -4838,21 +4860,23 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, IEEE80211_STA_DISABLE_160MHZ); /* disable HT/VHT/HE if we don't support them */ - if (!sband->ht_cap.ht_supported) { + if (!sband->ht_cap.ht_supported && !is_6ghz) { ifmgd->flags |= IEEE80211_STA_DISABLE_HT; ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; ifmgd->flags |= IEEE80211_STA_DISABLE_HE; } - if (!sband->vht_cap.vht_supported) + if (!sband->vht_cap.vht_supported && !is_6ghz) { ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; + ifmgd->flags |= IEEE80211_STA_DISABLE_HE; + } if (!ieee80211_get_he_sta_cap(sband)) ifmgd->flags |= IEEE80211_STA_DISABLE_HE; rcu_read_lock(); - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) { + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && !is_6ghz) { const u8 *ht_oper_ie, *ht_cap_ie; 
ht_oper_ie = ieee80211_bss_get_ie(cbss, WLAN_EID_HT_OPERATION); @@ -4869,7 +4893,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, } } - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) { + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && !is_6ghz) { const u8 *vht_oper_ie, *vht_cap; vht_oper_ie = ieee80211_bss_get_ie(cbss, @@ -4934,6 +4958,11 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, rcu_read_unlock(); + if (ifmgd->flags & IEEE80211_STA_DISABLE_HE && is_6ghz) { + sdata_info(sdata, "Rejecting non-HE 6/7 GHz connection"); + return -EINVAL; + } + /* will change later if needed */ sdata->smps_mode = IEEE80211_SMPS_OFF; @@ -5315,6 +5344,7 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, struct cfg80211_assoc_request *req) { + bool is_6ghz = req->bss->channel->band == NL80211_BAND_6GHZ; struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_bss *bss = (void *)req->bss->priv; @@ -5457,14 +5487,15 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, if (ht_ie && ht_ie[1] >= sizeof(struct ieee80211_ht_operation)) assoc_data->ap_ht_param = ((struct ieee80211_ht_operation *)(ht_ie + 2))->ht_param; - else + else if (!is_6ghz) ifmgd->flags |= IEEE80211_STA_DISABLE_HT; vht_ie = ieee80211_bss_get_ie(req->bss, WLAN_EID_VHT_CAPABILITY); if (vht_ie && vht_ie[1] >= sizeof(struct ieee80211_vht_cap)) memcpy(&assoc_data->ap_vht_cap, vht_ie + 2, sizeof(struct ieee80211_vht_cap)); - else - ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; + else if (!is_6ghz) + ifmgd->flags |= IEEE80211_STA_DISABLE_VHT | + IEEE80211_STA_DISABLE_HE; rcu_read_unlock(); if (WARN((sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_UAPSD) && diff --git a/net/mac80211/util.c b/net/mac80211/util.c index e6104829fa1c..cbe24d303f0d 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3244,6 +3244,112 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info, return true; } +bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata, + const struct ieee80211_he_operation *he_oper, + struct cfg80211_chan_def *chandef) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_supported_band *sband; + enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); + const struct ieee80211_sta_he_cap *he_cap; + struct cfg80211_chan_def he_chandef = *chandef; + const struct ieee80211_he_6ghz_oper *he_6ghz_oper; + bool support_80_80, support_160; + u8 he_phy_cap; + u32 freq; + + if (chandef->chan->band != NL80211_BAND_6GHZ) + return true; + + sband = local->hw.wiphy->bands[NL80211_BAND_6GHZ]; + + he_cap = ieee80211_get_he_iftype_cap(sband, iftype); + if (!he_cap) { + sdata_info(sdata, "Missing iftype sband data/HE cap"); + return false; + } + + he_phy_cap = he_cap->he_cap_elem.phy_cap_info[0]; + support_160 = + he_phy_cap & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G; + support_80_80 = + he_phy_cap & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G; + + if (!he_oper) { + sdata_info(sdata, + "HE is not advertised on (on %d MHz), expect issues\n", + chandef->chan->center_freq); + return false; + } + + he_6ghz_oper = ieee80211_he_6ghz_oper(he_oper); + + if (!he_6ghz_oper) { + sdata_info(sdata, + "HE 6GHz operation missing (on %d MHz), expect issues\n", + chandef->chan->center_freq); + return false; + } + + freq = ieee80211_channel_to_frequency(he_6ghz_oper->primary, + 
NL80211_BAND_6GHZ); + he_chandef.chan = ieee80211_get_channel(sdata->local->hw.wiphy, freq); + + switch (u8_get_bits(he_6ghz_oper->control, + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH)) { + case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ: + he_chandef.width = NL80211_CHAN_WIDTH_20; + break; + case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ: + he_chandef.width = NL80211_CHAN_WIDTH_40; + break; + case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ: + he_chandef.width = NL80211_CHAN_WIDTH_80; + break; + case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ: + he_chandef.width = NL80211_CHAN_WIDTH_80; + if (!he_6ghz_oper->ccfs1) + break; + if (abs(he_6ghz_oper->ccfs1 - he_6ghz_oper->ccfs0) == 8) { + if (support_160) + he_chandef.width = NL80211_CHAN_WIDTH_160; + } else { + if (support_80_80) + he_chandef.width = NL80211_CHAN_WIDTH_80P80; + } + break; + } + + if (he_chandef.width == NL80211_CHAN_WIDTH_160) { + he_chandef.center_freq1 = + ieee80211_channel_to_frequency(he_6ghz_oper->ccfs1, + NL80211_BAND_6GHZ); + } else { + he_chandef.center_freq1 = + ieee80211_channel_to_frequency(he_6ghz_oper->ccfs0, + NL80211_BAND_6GHZ); + he_chandef.center_freq2 = + ieee80211_channel_to_frequency(he_6ghz_oper->ccfs1, + NL80211_BAND_6GHZ); + } + + if (!cfg80211_chandef_valid(&he_chandef)) { + sdata_info(sdata, + "HE 6GHz operation resulted in invalid chandef: %d MHz/%d/%d MHz/%d MHz\n", + he_chandef.chan ? he_chandef.chan->center_freq : 0, + he_chandef.width, + he_chandef.center_freq1, + he_chandef.center_freq2); + return false; + } + + *chandef = he_chandef; + + return true; +} + int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef, const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates) -- cgit v1.2.3 From 3b3ec3d52e8f72ec8c40477b96f23440a89000be Mon Sep 17 00:00:00 2001 From: Shaul Triebitz Date: Thu, 28 May 2020 21:34:37 +0200 Subject: mac80211: check the correct bit for EMA AP An AP supporting EMA (Enhanced Multi-BSSID advertisement) should set bit 83 in the extended capabilities IE (9.4.2.26 in the 802.11ax D5 spec). So the *3rd* bit of the 10th byte should be checked. Also, in one place, the wrong byte was checked. (cfg80211_find_ie returns a pointer to the beginning of the IE, so the data really starts at ie[2], so the 10th byte should be ie[12]. To avoid this confusion, use cfg80211_find_elem instead). 
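The byte/bit arithmetic behind this fix can be checked mechanically:
extended-capability bit 83 lands in (0-based) byte 83 / 8 = 10 at bit
83 % 8 = 3, and with the raw IE's two-byte header that byte sits at
ie[2 + 10] = ie[12].

#include <stdio.h>

int main(void)
{
	unsigned int bit = 83;

	printf("byte %u, bit %u -> BIT(%u), raw IE offset ie[%u]\n",
	       bit / 8, bit % 8, bit % 8, 2 + bit / 8);
	return 0;
}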
Signed-off-by: Shaul Triebitz Link: https://lore.kernel.org/r/20200528213443.4316121fa2a3.I9745582f8d41ad8e689dac0fefcd70b276d7c1ea@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 2 +- net/mac80211/mlme.c | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 9580dfd9e2d1..1ecfd19f836d 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -3082,7 +3082,7 @@ enum ieee80211_tdls_actioncode { #define WLAN_EXT_CAPA10_OBSS_NARROW_BW_RU_TOLERANCE_SUPPORT BIT(7) /* Defines support for enhanced multi-bssid advertisement*/ -#define WLAN_EXT_CAPA11_EMA_SUPPORT BIT(1) +#define WLAN_EXT_CAPA11_EMA_SUPPORT BIT(3) /* TDLS specific payload type in the LLC/SNAP header */ #define WLAN_TDLS_SNAP_RFTYPE 0x2 diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index bc558d1d20fc..c534cd1bb9cd 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -5596,7 +5596,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, assoc_data->timeout_started = true; assoc_data->need_beacon = true; } else if (beacon_ies) { - const u8 *ie; + const struct element *elem; u8 dtim_count = 0; ieee80211_get_dtim(beacon_ies, &dtim_count, @@ -5613,15 +5613,15 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.sync_dtim_count = dtim_count; } - ie = cfg80211_find_ext_ie(WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION, - beacon_ies->data, beacon_ies->len); - if (ie && ie[1] >= 3) - sdata->vif.bss_conf.profile_periodicity = ie[4]; + elem = cfg80211_find_ext_elem(WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION, + beacon_ies->data, beacon_ies->len); + if (elem && elem->datalen >= 3) + sdata->vif.bss_conf.profile_periodicity = elem->data[2]; - ie = cfg80211_find_ie(WLAN_EID_EXT_CAPABILITY, - beacon_ies->data, beacon_ies->len); - if (ie && ie[1] >= 11 && - (ie[10] & WLAN_EXT_CAPA11_EMA_SUPPORT)) + elem = cfg80211_find_elem(WLAN_EID_EXT_CAPABILITY, + beacon_ies->data, beacon_ies->len); + if (elem && elem->datalen >= 11 && + (elem->data[10] & WLAN_EXT_CAPA11_EMA_SUPPORT)) sdata->vif.bss_conf.ema_ap = true; } else { assoc_data->timeout = jiffies; -- cgit v1.2.3 From 1bb9a8a4c81d0305c511a0919cd30ebfa91915ae Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:38 +0200 Subject: mac80211: use HE 6 GHz band capability and pass it to the driver In order to handle 6 GHz AP side, take the HE 6 GHz band capability data and pass it to the driver (which needs it for A-MPDU spacing and A-MPDU length). 
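A standalone sketch of how the SM power save subfield is decoded out of
the 16-bit HE 6 GHz capability; the field position (bits 9-10) is an
assumption of the sketch, and the values reuse the HT SM PS encoding,
as the patch's switch shows:

#include <stdint.h>
#include <stdio.h>

#define HE_6GHZ_CAP_SM_PS	0x0600	/* assumed field position */

enum smps_sketch { SMPS_STATIC, SMPS_DYNAMIC, SMPS_OFF };

static enum smps_sketch he_6ghz_smps(uint16_t capa)
{
	switch ((capa & HE_6GHZ_CAP_SM_PS) >> 9) {
	case 0:	/* static */
	case 2:	/* "invalid": treated as static, as in the patch */
		return SMPS_STATIC;
	case 1:	/* dynamic */
		return SMPS_DYNAMIC;
	default: /* disabled */
		return SMPS_OFF;
	}
}

int main(void)
{
	printf("%d\n", he_6ghz_smps(3 << 9));	/* SMPS_OFF */
	return 0;
}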
Link: https://lore.kernel.org/r/1589399105-25472-6-git-send-email-rmanohar@codeaurora.org Co-developed-by: Rajkumar Manoharan Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/20200528213443.784e4890d82f.I5f1230d5ab27e84e7bbe88e3645b24ea15a0c146@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 +++- net/mac80211/cfg.c | 4 +++- net/mac80211/he.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++ net/mac80211/ieee80211_i.h | 1 + net/mac80211/mesh_plink.c | 4 +++- net/mac80211/mlme.c | 1 + 6 files changed, 59 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 7cb712427df1..11d5610d2ad5 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -7,7 +7,7 @@ * Copyright 2007-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2020 Intel Corporation */ #ifndef MAC80211_H @@ -1977,6 +1977,7 @@ struct ieee80211_sta_txpwr { * @ht_cap: HT capabilities of this STA; restricted to our own capabilities * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities * @he_cap: HE capabilities of this STA + * @he_6ghz_capa: on 6 GHz, holds the HE 6 GHz band capabilities * @max_rx_aggregation_subframes: maximal amount of frames in a single AMPDU * that this station is allowed to transmit to us. * Can be modified by driver. @@ -2016,6 +2017,7 @@ struct ieee80211_sta { struct ieee80211_sta_ht_cap ht_cap; struct ieee80211_sta_vht_cap vht_cap; struct ieee80211_sta_he_cap he_cap; + struct ieee80211_he_6ghz_capa he_6ghz_capa; u16 max_rx_aggregation_subframes; bool wme; u8 uapsd_queues; diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 06a2b7640a9d..90a07d075fdb 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1520,7 +1520,9 @@ static int sta_apply_parameters(struct ieee80211_local *local, if (params->he_capa) ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, (void *)params->he_capa, - params->he_capa_len, sta); + params->he_capa_len, + (void *)params->he_6ghz_capa, + sta); if (params->opmode_notif_used) { /* returned value is only needed for rc update, but the diff --git a/net/mac80211/he.c b/net/mac80211/he.c index f520552b22be..cc26f239838b 100644 --- a/net/mac80211/he.c +++ b/net/mac80211/he.c @@ -8,10 +8,55 @@ #include "ieee80211_i.h" +static void +ieee80211_update_from_he_6ghz_capa(const struct ieee80211_he_6ghz_capa *he_6ghz_capa, + struct sta_info *sta) +{ + enum ieee80211_smps_mode smps_mode; + + if (sta->sdata->vif.type == NL80211_IFTYPE_AP || + sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { + switch (le16_get_bits(he_6ghz_capa->capa, + IEEE80211_HE_6GHZ_CAP_SM_PS)) { + case WLAN_HT_CAP_SM_PS_INVALID: + case WLAN_HT_CAP_SM_PS_STATIC: + smps_mode = IEEE80211_SMPS_STATIC; + break; + case WLAN_HT_CAP_SM_PS_DYNAMIC: + smps_mode = IEEE80211_SMPS_DYNAMIC; + break; + case WLAN_HT_CAP_SM_PS_DISABLED: + smps_mode = IEEE80211_SMPS_OFF; + break; + } + + sta->sta.smps_mode = smps_mode; + } else { + sta->sta.smps_mode = IEEE80211_SMPS_OFF; + } + + switch (le16_get_bits(he_6ghz_capa->capa, + IEEE80211_HE_6GHZ_CAP_MAX_MPDU_LEN)) { + case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454: + sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_11454; + break; + case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991: + sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_7991; + break; + case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895: + default: + 
sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_3895; + break; + } + + sta->sta.he_6ghz_capa = *he_6ghz_capa; +} + void ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const u8 *he_cap_ie, u8 he_cap_len, + const struct ieee80211_he_6ghz_capa *he_6ghz_capa, struct sta_info *sta) { struct ieee80211_sta_he_cap *he_cap = &sta->sta.he_cap; @@ -53,6 +98,9 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, sta->cur_max_bandwidth = ieee80211_sta_cap_rx_bw(sta); sta->sta.bandwidth = ieee80211_sta_cur_vht_bw(sta); + + if (sband->band == NL80211_BAND_6GHZ && he_6ghz_capa) + ieee80211_update_from_he_6ghz_capa(he_6ghz_capa, sta); } void diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 6cac5bf7cba3..24dc1fd57000 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1899,6 +1899,7 @@ void ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const u8 *he_cap_ie, u8 he_cap_len, + const struct ieee80211_he_6ghz_capa *he_6ghz_capa, struct sta_info *sta); void ieee80211_he_spr_ie_to_bss_conf(struct ieee80211_vif *vif, diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index fbbfc5d4a51c..798e4b6b383f 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -444,7 +444,9 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, elems->vht_cap_elem, sta); ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, elems->he_cap, - elems->he_cap_len, sta); + elems->he_cap_len, + elems->he_6ghz_capa, + sta); if (bw != sta->sta.bandwidth) changed |= IEEE80211_RC_BW_CHANGED; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index c534cd1bb9cd..8a37089e86bb 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3430,6 +3430,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, elems->he_cap, elems->he_cap_len, + elems->he_6ghz_capa, sta); bss_conf->he_support = sta->sta.he_cap.has_he; -- cgit v1.2.3 From 2ad2274c58ee2dcaf9ccde5c63ff30f59b138f77 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Thu, 28 May 2020 21:34:39 +0200 Subject: mac80211: Add HE 6GHz capabilities element to probe request On 6 GHz, the 6 GHz capabilities element should be added, do that. 
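The element added here is an Element ID Extension, so the on-air encoding is: EID 255 (WLAN_EID_EXTENSION), a one-octet length of 3, the extension ID WLAN_EID_EXT_HE_6GHZ_CAPA, then the two capability octets in little-endian order. A small sketch of that layout, with a made-up capability value purely for illustration:

#include <linux/ieee80211.h>
#include <linux/string.h>

static void build_he_6ghz_capa_elem(u8 elem[5])
{
	__le16 capa = cpu_to_le16(0x06fa);	/* illustrative value only */

	elem[0] = WLAN_EID_EXTENSION;		/* 255 */
	elem[1] = 1 + sizeof(capa);		/* 3: ext EID + 2 capa octets */
	elem[2] = WLAN_EID_EXT_HE_6GHZ_CAPA;
	memcpy(&elem[3], &capa, sizeof(capa));	/* already little-endian */
}

This corresponds to the 5-byte bound checked by ieee80211_write_he_6ghz_cap() in the diff below.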
Signed-off-by: Ilan Peer [add commit message] Link: https://lore.kernel.org/r/20200528213443.8ee764f0cde0.I2b0c66b60e11818c97c9803e04a6a197c6376243@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 20 ++++++++++++++++++++ net/mac80211/ieee80211_i.h | 2 +- net/mac80211/scan.c | 17 +++++++++-------- net/mac80211/util.c | 36 ++++++++++++++++++++++++++++-------- 4 files changed, 58 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 9b76be3d561a..95b55eea2afb 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -512,6 +512,26 @@ ieee80211_get_he_sta_cap(const struct ieee80211_supported_band *sband) return ieee80211_get_he_iftype_cap(sband, NL80211_IFTYPE_STATION); } +/** + * ieee80211_get_he_6ghz_capa - return HE 6 GHz capabilities + * @sband: the sband to search for the STA on + * @iftype: the iftype to search for + * + * Return: the 6GHz capabilities + */ +static inline __le16 +ieee80211_get_he_6ghz_capa(const struct ieee80211_supported_band *sband, + enum nl80211_iftype iftype) +{ + const struct ieee80211_sband_iftype_data *data = + ieee80211_get_sband_iftype_data(sband, iftype); + + if (WARN_ON(!data || !data->he_cap.has_he)) + return 0; + + return data->he_6ghz_capa.capa; +} + /** * wiphy_read_of_freq_limits - read frequency limits from device tree * diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 24dc1fd57000..ec1a71ac65f2 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2144,7 +2144,7 @@ enum { IEEE80211_PROBE_FLAG_RANDOM_SN = BIT(2), }; -int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, +int ieee80211_build_preq_ies(struct ieee80211_sub_if_data *sdata, u8 *buffer, size_t buffer_len, struct ieee80211_scan_ies *ie_desc, const u8 *ie, size_t ie_len, diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index d0c2e8012118..ad90bbe57457 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -313,8 +313,9 @@ ieee80211_prepare_scan_chandef(struct cfg80211_chan_def *chandef, } /* return false if no more work */ -static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) +static bool ieee80211_prep_hw_scan(struct ieee80211_sub_if_data *sdata) { + struct ieee80211_local *local = sdata->local; struct cfg80211_scan_request *req; struct cfg80211_chan_def chandef; u8 bands_used = 0; @@ -361,7 +362,7 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; - ielen = ieee80211_build_preq_ies(local, + ielen = ieee80211_build_preq_ies(sdata, (u8 *)local->hw_scan_req->req.ie, local->hw_scan_ies_bufsize, &local->hw_scan_req->ies, @@ -401,9 +402,12 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) if (WARN_ON(!local->scan_req)) return; + scan_sdata = rcu_dereference_protected(local->scan_sdata, + lockdep_is_held(&local->mtx)); + if (hw_scan && !aborted && !ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS) && - ieee80211_prep_hw_scan(local)) { + ieee80211_prep_hw_scan(scan_sdata)) { int rc; rc = drv_hw_scan(local, @@ -432,9 +436,6 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) cfg80211_scan_done(scan_req, &local->scan_info); } RCU_INIT_POINTER(local->scan_req, NULL); - - scan_sdata = rcu_dereference_protected(local->scan_sdata, - lockdep_is_held(&local->mtx)); RCU_INIT_POINTER(local->scan_sdata, NULL); local->scanning = 0; @@ -776,7 +777,7 @@ 
static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, ieee80211_recalc_idle(local); if (hw_scan) { - WARN_ON(!ieee80211_prep_hw_scan(local)); + WARN_ON(!ieee80211_prep_hw_scan(sdata)); rc = drv_hw_scan(local, sdata, local->hw_scan_req); } else { rc = ieee80211_start_sw_scan(local, sdata); @@ -1274,7 +1275,7 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, ieee80211_prepare_scan_chandef(&chandef, req->scan_width); - ieee80211_build_preq_ies(local, ie, num_bands * iebufsz, + ieee80211_build_preq_ies(sdata, ie, num_bands * iebufsz, &sched_scan_ies, req->ie, req->ie_len, bands_used, rate_masks, &chandef, flags); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index cbe24d303f0d..21c94094a699 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1663,7 +1663,20 @@ void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, } } -static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, +static u8 *ieee80211_write_he_6ghz_cap(u8 *pos, __le16 cap, u8 *end) +{ + if ((end - pos) < 5) + return pos; + + *pos++ = WLAN_EID_EXTENSION; + *pos++ = 1 + sizeof(cap); + *pos++ = WLAN_EID_EXT_HE_6GHZ_CAPA; + memcpy(pos, &cap, sizeof(cap)); + + return pos + 2; +} + +static int ieee80211_build_preq_ies_band(struct ieee80211_sub_if_data *sdata, u8 *buffer, size_t buffer_len, const u8 *ie, size_t ie_len, enum nl80211_band band, @@ -1671,6 +1684,7 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, struct cfg80211_chan_def *chandef, size_t *offset, u32 flags) { + struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; const struct ieee80211_sta_he_cap *he_cap; u8 *pos = buffer, *end = buffer + buffer_len; @@ -1848,6 +1862,14 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, pos = ieee80211_ie_build_he_cap(pos, he_cap, end); if (!pos) goto out_err; + + if (sband->band == NL80211_BAND_6GHZ) { + enum nl80211_iftype iftype = + ieee80211_vif_type_p2p(&sdata->vif); + __le16 cap = ieee80211_get_he_6ghz_capa(sband, iftype); + + pos = ieee80211_write_he_6ghz_cap(pos, cap, end); + } } /* @@ -1862,7 +1884,7 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, return pos - buffer; } -int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, +int ieee80211_build_preq_ies(struct ieee80211_sub_if_data *sdata, u8 *buffer, size_t buffer_len, struct ieee80211_scan_ies *ie_desc, const u8 *ie, size_t ie_len, @@ -1877,7 +1899,7 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, for (i = 0; i < NUM_NL80211_BANDS; i++) { if (bands_used & BIT(i)) { - pos += ieee80211_build_preq_ies_band(local, + pos += ieee80211_build_preq_ies_band(sdata, buffer + pos, buffer_len - pos, ie, ie_len, i, @@ -1939,7 +1961,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, return NULL; rate_masks[chan->band] = ratemask; - ies_len = ieee80211_build_preq_ies(local, skb_tail_pointer(skb), + ies_len = ieee80211_build_preq_ies(sdata, skb_tail_pointer(skb), skb_tailroom(skb), &dummy_ie_desc, ie, ie_len, BIT(chan->band), rate_masks, &chandef, flags); @@ -2879,10 +2901,8 @@ void ieee80211_ie_build_he_6ghz_cap(struct ieee80211_sub_if_data *sdata, } pos = skb_put(skb, 2 + 1 + sizeof(cap)); - *pos++ = WLAN_EID_EXTENSION; - *pos++ = 1 + sizeof(cap); - *pos++ = WLAN_EID_EXT_HE_6GHZ_CAPA; - put_unaligned_le16(cap, pos); + ieee80211_write_he_6ghz_cap(pos, cpu_to_le16(cap), + pos + 2 + 1 + sizeof(cap)); } u8 
*ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, -- cgit v1.2.3 From ba8f6a037f790147438173029799f54c9d3065f2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:40 +0200 Subject: cfg80211: treat 6 GHz channels as valid regardless of capability If a 6 GHz channel exists, then we can probably safely assume that the device actually supports it, and then it should support most bandwidths. This will probably need to be extended to check the interface type and then dig into the HE capabilities for that though, to have the correct bandwidth check. Link: https://lore.kernel.org/r/20200528213443.d4864ef52e92.I82f09b2b14a56413ce20376d09967fe954a033eb@changeid Signed-off-by: Johannes Berg --- net/wireless/chan.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/wireless/chan.c b/net/wireless/chan.c index e111c08daa0e..cddf92c5d09e 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -6,7 +6,7 @@ * * Copyright 2009 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright 2018 Intel Corporation + * Copyright 2018-2020 Intel Corporation */ #include @@ -919,7 +919,8 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, width = 10; break; case NL80211_CHAN_WIDTH_20: - if (!ht_cap->ht_supported) + if (!ht_cap->ht_supported && + chandef->chan->band != NL80211_BAND_6GHZ) return false; /* fall through */ case NL80211_CHAN_WIDTH_20_NOHT: @@ -928,6 +929,8 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, break; case NL80211_CHAN_WIDTH_40: width = 40; + if (chandef->chan->band == NL80211_BAND_6GHZ) + break; if (!ht_cap->ht_supported) return false; if (!(ht_cap->cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) || @@ -942,24 +945,29 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, break; case NL80211_CHAN_WIDTH_80P80: cap = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; - if (cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) + if (chandef->chan->band != NL80211_BAND_6GHZ && + cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) return false; /* fall through */ case NL80211_CHAN_WIDTH_80: - if (!vht_cap->vht_supported) - return false; prohibited_flags |= IEEE80211_CHAN_NO_80MHZ; width = 80; + if (chandef->chan->band == NL80211_BAND_6GHZ) + break; + if (!vht_cap->vht_supported) + return false; break; case NL80211_CHAN_WIDTH_160: + prohibited_flags |= IEEE80211_CHAN_NO_160MHZ; + width = 160; + if (chandef->chan->band == NL80211_BAND_6GHZ) + break; if (!vht_cap->vht_supported) return false; cap = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; if (cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ && cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) return false; - prohibited_flags |= IEEE80211_CHAN_NO_160MHZ; - width = 160; break; default: WARN_ON_ONCE(1); -- cgit v1.2.3 From 461ce35d5535c1479384f67fcf4bfc3f3610edca Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:41 +0200 Subject: cfg80211: reject HT/VHT capabilities on 6 GHz band On the 6 GHz band, HE should be used, but without any direct HT/VHT capabilities, instead the HE 6 GHz band capabilities will capture the relevant information. Reject HT/VHT capabilities here. 
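Taken together with the next patch in this series (which requires HE on all supported interface types), the expected shape of a driver's 6 GHz band registration is: ht_cap and vht_cap left zeroed, HE capabilities advertised per interface type. A sketch under those assumptions, with invented capability values and the channel/bitrate setup elided:

#include <linux/bits.h>
#include <linux/kernel.h>
#include <net/cfg80211.h>

static const struct ieee80211_sband_iftype_data drv_6ghz_iftype_data[] = {
	{
		.types_mask = BIT(NL80211_IFTYPE_STATION),
		.he_cap = {
			.has_he = true,
			/* .he_cap_elem and .he_mcs_nss_supp elided */
		},
		.he_6ghz_capa = {
			.capa = cpu_to_le16(0x3038),	/* invented value */
		},
	},
};

static struct ieee80211_supported_band drv_band_6ghz = {
	.band = NL80211_BAND_6GHZ,
	/* no .ht_cap/.vht_cap: per the diff below, wiphy_register()
	 * warns and fails if they are set on this band */
	.iftype_data = drv_6ghz_iftype_data,
	.n_iftype_data = ARRAY_SIZE(drv_6ghz_iftype_data),
};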
Link: https://lore.kernel.org/r/20200528213443.bfe89c35459a.Ibba5e066fa0087fd49d13cfee89d196ea0c68ae2@changeid Signed-off-by: Johannes Berg --- net/wireless/core.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index b795f363d004..1651f86db6ca 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -5,7 +5,7 @@ * Copyright 2006-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2019 Intel Corporation + * Copyright (C) 2018-2020 Intel Corporation */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -807,6 +807,11 @@ int wiphy_register(struct wiphy *wiphy) !sband->n_bitrates)) return -EINVAL; + if (WARN_ON(band == NL80211_BAND_6GHZ && + (sband->ht_cap.ht_supported || + sband->vht_cap.vht_supported))) + return -EINVAL; + /* * Since cfg80211_disable_40mhz_24ghz is global, we can * modify the sband's ht data even if the driver uses a -- cgit v1.2.3 From f438136528482f98535889c9a6f99bbacdd92870 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:42 +0200 Subject: cfg80211: require HE capabilities for 6 GHz band On 6 GHz band, HE capabilities must be available for all of the interface types, otherwise we shouldn't use 6 GHz. Check this. Link: https://lore.kernel.org/r/20200528213443.5881cb3c8c4a.I583b54172f91f98d44af64a16c5826fe458cbb27@changeid Signed-off-by: Johannes Berg --- net/wireless/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index 1651f86db6ca..5b6714460490 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -791,6 +791,7 @@ int wiphy_register(struct wiphy *wiphy) /* sanity check supported bands/channels */ for (band = 0; band < NUM_NL80211_BANDS; band++) { u16 types = 0; + bool have_he = false; sband = wiphy->bands[band]; if (!sband) @@ -859,8 +860,17 @@ int wiphy_register(struct wiphy *wiphy) return -EINVAL; types |= iftd->types_mask; + + if (i == 0) + have_he = iftd->he_cap.has_he; + else + have_he = have_he && + iftd->he_cap.has_he; } + if (WARN_ON(!have_he && band == NL80211_BAND_6GHZ)) + return -EINVAL; + have_band = true; } -- cgit v1.2.3 From 93382a0d119b3ab95e3ebca51ea15aa87187b493 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:44 +0200 Subject: mac80211: accept aggregation sessions on 6 GHz On 6 GHz, stations don't have ht_supported set, but they can still do aggregation since they must have HE, allow that. 
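The two hunks below add the same test on the RX and TX paths; read as one predicate, it amounts to the following hypothetical helper (not part of the patch):

#include <net/mac80211.h>

/* mirrors the checks added in agg-rx.c and agg-tx.c */
static bool sta_band_allows_ampdu(struct ieee80211_sta *pubsta,
				  enum nl80211_band band)
{
	/* HT signals A-MPDU support on 2.4/5 GHz; 6 GHz has no HT at all,
	 * but HE is mandatory there and provides aggregation, so allow it
	 */
	return pubsta->ht_cap.ht_supported || band == NL80211_BAND_6GHZ;
}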
Link: https://lore.kernel.org/r/20200528213443.776d3c891b64.Ifa099d450617b50c691832b3c4aa08959fab520a@changeid Signed-off-by: Johannes Berg --- net/mac80211/agg-rx.c | 5 +++-- net/mac80211/agg-tx.c | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 4d1c335e06e5..7f245e9f114c 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu * Copyright 2007-2010, Intel Corporation * Copyright(c) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018-2020 Intel Corporation */ /** @@ -292,7 +292,8 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, goto end; } - if (!sta->sta.ht_cap.ht_supported) { + if (!sta->sta.ht_cap.ht_supported && + sta->sdata->vif.bss_conf.chandef.chan->band != NL80211_BAND_6GHZ) { ht_dbg(sta->sdata, "STA %pM erroneously requests BA session on tid %d w/o QoS\n", sta->sta.addr, tid); diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index c2d5f512526d..b37c8a983d88 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -593,7 +593,8 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, "Requested to start BA session on reserved tid=%d", tid)) return -EINVAL; - if (!pubsta->ht_cap.ht_supported) + if (!pubsta->ht_cap.ht_supported && + sta->sdata->vif.bss_conf.chandef.chan->band != NL80211_BAND_6GHZ) return -EINVAL; if (WARN_ON_ONCE(!local->ops->ampdu_action)) -- cgit v1.2.3 From 6fcb56ce0f9036b08d1c3e35ff93b100d499771b Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Thu, 28 May 2020 21:34:45 +0200 Subject: mac80211: Consider 6 GHz band when handling power constraint Treat it like the 5 GHz band. Signed-off-by: Ilan Peer Link: https://lore.kernel.org/r/20200528213443.889e5c9dd006.Id8ed3bb8000ba8738be5df05639415eb2e23c61a@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 8a37089e86bb..d7bffd640ed3 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1533,6 +1533,7 @@ ieee80211_find_80211h_pwr_constr(struct ieee80211_sub_if_data *sdata, chan_increment = 1; break; case NL80211_BAND_5GHZ: + case NL80211_BAND_6GHZ: chan_increment = 4; break; } -- cgit v1.2.3 From 07c12d618f06c8876fe12a53217d92b32d7baf07 Mon Sep 17 00:00:00 2001 From: Tova Mussai Date: Thu, 28 May 2020 21:34:46 +0200 Subject: mac80211: set short_slot for 6 GHz band Set short slot also for 6 GHz band, just like 5 GHz. 
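Short slot time (9 us instead of the legacy 20 us) is an ERP-era distinction that only matters on 2.4 GHz; on OFDM-only bands the short slot always applies. A condensed view of the rule the two hunks below implement (hypothetical helper, not from the patch):

#include <linux/ieee80211.h>
#include <net/cfg80211.h>

static bool band_uses_short_slot(enum nl80211_band band, u16 capab)
{
	/* 5 and 6 GHz are OFDM-only, the long slot never applies there */
	if (band == NL80211_BAND_5GHZ || band == NL80211_BAND_6GHZ)
		return true;

	/* on 2.4 GHz, follow the capability bit advertised by the AP */
	return !!(capab & WLAN_CAPABILITY_SHORT_SLOT_TIME);
}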
Signed-off-by: Tova Mussai Link: https://lore.kernel.org/r/20200528213443.75f38e6f5efd.I272fbae402b03123f04e9ae69204eeab960c70cd@changeid Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 3 ++- net/mac80211/mlme.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 90a07d075fdb..9b360544ad6f 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2198,7 +2198,8 @@ static int ieee80211_change_bss(struct wiphy *wiphy, } if (!sdata->vif.bss_conf.use_short_slot && - sband->band == NL80211_BAND_5GHZ) { + (sband->band == NL80211_BAND_5GHZ || + sband->band == NL80211_BAND_6GHZ)) { sdata->vif.bss_conf.use_short_slot = true; changed |= BSS_CHANGED_ERP_SLOT; } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index d7bffd640ed3..5820ef02a587 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2171,7 +2171,8 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata, } use_short_slot = !!(capab & WLAN_CAPABILITY_SHORT_SLOT_TIME); - if (sband->band == NL80211_BAND_5GHZ) + if (sband->band == NL80211_BAND_5GHZ || + sband->band == NL80211_BAND_6GHZ) use_short_slot = true; if (use_protection != bss_conf->use_cts_prot) { -- cgit v1.2.3 From 093a48d2aa4b74db3134b61d7b7a061dbe79177b Mon Sep 17 00:00:00 2001 From: Nathan Errera Date: Thu, 28 May 2020 21:22:38 +0200 Subject: cfg80211: support bigger kek/kck key length With some newer AKMs, the KCK and KEK are bigger, so allow that if the driver advertises support for it. In addition, add a new attribute for the AKM so we can use it for offloaded rekeying. Signed-off-by: Nathan Errera [reword commit message] Link: https://lore.kernel.org/r/20200528212237.5eb58b00a5d1.I61b09d77c4f382e8d58a05dcca78096e99a6bc15@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 12 +++++++++--- include/uapi/linux/nl80211.h | 4 ++++ net/wireless/nl80211.c | 23 +++++++++++++++++++---- 3 files changed, 32 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 95b55eea2afb..b58ad1a3f695 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2936,12 +2936,17 @@ struct cfg80211_wowlan_wakeup { /** * struct cfg80211_gtk_rekey_data - rekey data - * @kek: key encryption key (NL80211_KEK_LEN bytes) - * @kck: key confirmation key (NL80211_KCK_LEN bytes) + * @kek: key encryption key (@kek_len bytes) + * @kck: key confirmation key (@kck_len bytes) * @replay_ctr: replay counter (NL80211_REPLAY_CTR_LEN bytes) + * @kek_len: length of kek + * @kck_len length of kck + * @akm: akm (oui, id) */ struct cfg80211_gtk_rekey_data { const u8 *kek, *kck, *replay_ctr; + u32 akm; + u8 kek_len, kck_len; }; /** @@ -4166,9 +4171,10 @@ struct cfg80211_ops { * beaconing mode (AP, IBSS, Mesh, ...). * @WIPHY_FLAG_HAS_STATIC_WEP: The device supports static WEP key installation * before connection. 
+ * @WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK: The device supports bigger kek and kck keys */ enum wiphy_flags { - /* use hole at 0 */ + WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK = BIT(0), /* use hole at 1 */ /* use hole at 2 */ WIPHY_FLAG_NETNS_OK = BIT(3), diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 5b350d032fa3..dad8c8f8581f 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -5396,6 +5396,8 @@ enum plink_actions { #define NL80211_KCK_LEN 16 #define NL80211_KEK_LEN 16 +#define NL80211_KCK_EXT_LEN 24 +#define NL80211_KEK_EXT_LEN 32 #define NL80211_REPLAY_CTR_LEN 8 /** @@ -5404,6 +5406,7 @@ enum plink_actions { * @NL80211_REKEY_DATA_KEK: key encryption key (binary) * @NL80211_REKEY_DATA_KCK: key confirmation key (binary) * @NL80211_REKEY_DATA_REPLAY_CTR: replay counter (binary) + * @NL80211_REKEY_DATA_AKM: AKM data (OUI, suite type) * @NUM_NL80211_REKEY_DATA: number of rekey attributes (internal) * @MAX_NL80211_REKEY_DATA: highest rekey attribute (internal) */ @@ -5412,6 +5415,7 @@ enum nl80211_rekey_data { NL80211_REKEY_DATA_KEK, NL80211_REKEY_DATA_KCK, NL80211_REKEY_DATA_REPLAY_CTR, + NL80211_REKEY_DATA_AKM, /* keep last */ NUM_NL80211_REKEY_DATA, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 3a24e6add13e..263ae395ad44 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -730,9 +730,16 @@ nl80211_coalesce_policy[NUM_NL80211_ATTR_COALESCE_RULE] = { /* policy for GTK rekey offload attributes */ static const struct nla_policy nl80211_rekey_policy[NUM_NL80211_REKEY_DATA] = { - [NL80211_REKEY_DATA_KEK] = NLA_POLICY_EXACT_LEN_WARN(NL80211_KEK_LEN), - [NL80211_REKEY_DATA_KCK] = NLA_POLICY_EXACT_LEN_WARN(NL80211_KCK_LEN), + [NL80211_REKEY_DATA_KEK] = { + .type = NLA_BINARY, + .len = NL80211_KEK_EXT_LEN + }, + [NL80211_REKEY_DATA_KCK] = { + .type = NLA_BINARY, + .len = NL80211_KCK_EXT_LEN + }, [NL80211_REKEY_DATA_REPLAY_CTR] = NLA_POLICY_EXACT_LEN_WARN(NL80211_REPLAY_CTR_LEN), + [NL80211_REKEY_DATA_AKM] = { .type = NLA_U32 }, }; static const struct nla_policy @@ -12347,14 +12354,22 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) return -EINVAL; if (nla_len(tb[NL80211_REKEY_DATA_REPLAY_CTR]) != NL80211_REPLAY_CTR_LEN) return -ERANGE; - if (nla_len(tb[NL80211_REKEY_DATA_KEK]) != NL80211_KEK_LEN) + if (nla_len(tb[NL80211_REKEY_DATA_KEK]) != NL80211_KEK_LEN && + !(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK && + nla_len(tb[NL80211_REKEY_DATA_KEK]) == NL80211_KEK_EXT_LEN)) return -ERANGE; - if (nla_len(tb[NL80211_REKEY_DATA_KCK]) != NL80211_KCK_LEN) + if (nla_len(tb[NL80211_REKEY_DATA_KCK]) != NL80211_KCK_LEN && + !(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK && + nla_len(tb[NL80211_REKEY_DATA_KCK]) == NL80211_KCK_EXT_LEN)) return -ERANGE; rekey_data.kek = nla_data(tb[NL80211_REKEY_DATA_KEK]); rekey_data.kck = nla_data(tb[NL80211_REKEY_DATA_KCK]); rekey_data.replay_ctr = nla_data(tb[NL80211_REKEY_DATA_REPLAY_CTR]); + rekey_data.kek_len = nla_len(tb[NL80211_REKEY_DATA_KEK]); + rekey_data.kck_len = nla_len(tb[NL80211_REKEY_DATA_KCK]); + if (tb[NL80211_REKEY_DATA_AKM]) rekey_data.akm = nla_get_u32(tb[NL80211_REKEY_DATA_AKM]); wdev_lock(wdev); if (!wdev->current_bss) { -- cgit v1.2.3 From dafe2078a75af1abe4780313ef8dd8491ba8598f Mon Sep 17 00:00:00 2001 From: Patrick Eigensatz Date: Mon, 1 Jun 2020 13:12:01 +0200 Subject: ipv4: nexthop: Fix deadcode issue by performing a proper NULL check After allocating the spare nexthop group it should be tested for kzalloc() returning
NULL, instead the already used nexthop group (which cannot be NULL at this point) had been tested so far. Additionally, if kzalloc() fails, return ERR_PTR(-ENOMEM) instead of NULL. Coverity-id: 1463885 Reported-by: Coverity Signed-off-by: Patrick Eigensatz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/nexthop.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index ebafa5ed91ac..400a9f89ebdb 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -1185,10 +1185,10 @@ static struct nexthop *nexthop_create_group(struct net *net, /* spare group used for removals */ nhg->spare = nexthop_grp_alloc(num_nh); - if (!nhg) { + if (!nhg->spare) { kfree(nhg); kfree(nh); - return NULL; + return ERR_PTR(-ENOMEM); } nhg->spare->spare = nhg; -- cgit v1.2.3 From 53fc685243bd6fb90d90305cea54598b78d3cbfc Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 1 Jun 2020 15:58:54 +0300 Subject: bridge: Avoid infinite loop when suppressing NS messages with invalid options When neighbor suppression is enabled the bridge device might reply to Neighbor Solicitation (NS) messages on behalf of remote hosts. In case the NS message includes the "Source link-layer address" option [1], the bridge device will use the specified address as the link-layer destination address in its reply. To avoid an infinite loop, break out of the options parsing loop when encountering an option with length zero and disregard the NS message. This is consistent with the IPv6 ndisc code and RFC 4886 which states that "Nodes MUST silently discard an ND packet that contains an option with length zero" [2]. [1] https://tools.ietf.org/html/rfc4861#section-4.3 [2] https://tools.ietf.org/html/rfc4861#section-4.6 Fixes: ed842faeb2bd ("bridge: suppress nd pkts on BR_NEIGH_SUPPRESS ports") Signed-off-by: Ido Schimmel Reported-by: Alla Segal Tested-by: Alla Segal Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_arp_nd_proxy.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c index 37908561a64b..b18cdf03edb3 100644 --- a/net/bridge/br_arp_nd_proxy.c +++ b/net/bridge/br_arp_nd_proxy.c @@ -276,6 +276,10 @@ static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p, ns_olen = request->len - (skb_network_offset(request) + sizeof(struct ipv6hdr)) - sizeof(*ns); for (i = 0; i < ns_olen - 1; i += (ns->opt[i + 1] << 3)) { + if (!ns->opt[i + 1]) { + kfree_skb(reply); + return; + } if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) { daddr = ns->opt + i + sizeof(struct nd_opt_hdr); break; -- cgit v1.2.3 From a01c245438c59a44f3ece8440046c78675fa8b0b Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 29 May 2020 00:05:32 +0200 Subject: net/sched: fix a couple of splats in the error path of tfc_gate_init() trying to configure TC 'act_gate' rules with invalid control actions, the following splat can be observed: general protection fault, probably for non-canonical address 0xdffffc0000000002: 0000 [#1] SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000010-0x0000000000000017] CPU: 1 PID: 2143 Comm: tc Not tainted 5.7.0-rc6+ #168 Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 RIP: 0010:hrtimer_active+0x56/0x290 [...] 
Call Trace: hrtimer_try_to_cancel+0x6d/0x330 hrtimer_cancel+0x11/0x20 tcf_gate_cleanup+0x15/0x30 [act_gate] tcf_action_cleanup+0x58/0x170 __tcf_action_put+0xb0/0xe0 __tcf_idr_release+0x68/0x90 tcf_gate_init+0x7c7/0x19a0 [act_gate] tcf_action_init_1+0x60f/0x960 tcf_action_init+0x157/0x2a0 tcf_action_add+0xd9/0x2f0 tc_ctl_action+0x2a3/0x39d rtnetlink_rcv_msg+0x5f3/0x920 netlink_rcv_skb+0x121/0x350 netlink_unicast+0x439/0x630 netlink_sendmsg+0x714/0xbf0 sock_sendmsg+0xe2/0x110 ____sys_sendmsg+0x5b4/0x890 ___sys_sendmsg+0xe9/0x160 __sys_sendmsg+0xd3/0x170 do_syscall_64+0x9a/0x370 entry_SYSCALL_64_after_hwframe+0x44/0xa9 this is caused by hrtimer_cancel(), running before hrtimer_init(). Fix it ensuring to call hrtimer_cancel() only if clockid is valid, and the timer has been initialized. After fixing this splat, the same error path causes another problem: general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] CPU: 1 PID: 980 Comm: tc Not tainted 5.7.0-rc6+ #168 Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 RIP: 0010:release_entry_list+0x4a/0x240 [act_gate] [...] Call Trace: tcf_action_cleanup+0x58/0x170 __tcf_action_put+0xb0/0xe0 __tcf_idr_release+0x68/0x90 tcf_gate_init+0x7ab/0x19a0 [act_gate] tcf_action_init_1+0x60f/0x960 tcf_action_init+0x157/0x2a0 tcf_action_add+0xd9/0x2f0 tc_ctl_action+0x2a3/0x39d rtnetlink_rcv_msg+0x5f3/0x920 netlink_rcv_skb+0x121/0x350 netlink_unicast+0x439/0x630 netlink_sendmsg+0x714/0xbf0 sock_sendmsg+0xe2/0x110 ____sys_sendmsg+0x5b4/0x890 ___sys_sendmsg+0xe9/0x160 __sys_sendmsg+0xd3/0x170 do_syscall_64+0x9a/0x370 entry_SYSCALL_64_after_hwframe+0x44/0xa9 the problem is similar: tcf_action_cleanup() was trying to release a list without initializing it first. Ensure that INIT_LIST_HEAD() is called for every newly created 'act_gate' action, same as what was done to 'act_ife' with commit 44c23d71599f ("net/sched: act_ife: initalize ife->metalist earlier"). Fixes: a51c328df310 ("net: qos: introduce a gate control flow action") CC: Ivan Vecera Signed-off-by: Davide Caratti Signed-off-by: David S. 
Miller --- net/sched/act_gate.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index 35fc48795541..9c628591f452 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -331,6 +331,10 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, tcf_idr_release(*a, bind); return -EEXIST; } + if (ret == ACT_P_CREATED) { + to_gate(*a)->param.tcfg_clockid = -1; + INIT_LIST_HEAD(&(to_gate(*a)->param.entries)); + } if (tb[TCA_GATE_PRIORITY]) prio = nla_get_s32(tb[TCA_GATE_PRIORITY]); @@ -377,7 +381,6 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, goto chain_put; } - INIT_LIST_HEAD(&p->entries); if (tb[TCA_GATE_ENTRY_LIST]) { err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack); if (err < 0) @@ -449,9 +452,9 @@ static void tcf_gate_cleanup(struct tc_action *a) struct tcf_gate *gact = to_gate(a); struct tcf_gate_params *p; - hrtimer_cancel(&gact->hitimer); - p = &gact->param; + if (p->tcfg_clockid != -1) + hrtimer_cancel(&gact->hitimer); release_entry_list(&p->entries); } -- cgit v1.2.3 From a8284c6899cf7321abbd258d970a9442978b0a4f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 May 2020 02:25:34 +0200 Subject: netfilter: nf_flowtable: expose nf_flow_table_gc_cleanup() This function schedules the flow teardown state and it forces a gc run. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/netfilter/nf_flow_table.h | 2 ++ net/netfilter/nf_flow_table_core.c | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index c54a7f707e50..d7338bfd7b0f 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -175,6 +175,8 @@ void flow_offload_refresh(struct nf_flowtable *flow_table, struct flow_offload_tuple_rhash *flow_offload_lookup(struct nf_flowtable *flow_table, struct flow_offload_tuple *tuple); +void nf_flow_table_gc_cleanup(struct nf_flowtable *flowtable, + struct net_device *dev); void nf_flow_table_cleanup(struct net_device *dev); int nf_flow_table_init(struct nf_flowtable *flow_table); diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 42da6e337276..6a3034f84ab6 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -588,8 +588,8 @@ static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) flow_offload_teardown(flow); } -static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable, - struct net_device *dev) +void nf_flow_table_gc_cleanup(struct nf_flowtable *flowtable, + struct net_device *dev) { nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev); flush_delayed_work(&flowtable->gc_work); @@ -602,7 +602,7 @@ void nf_flow_table_cleanup(struct net_device *dev) mutex_lock(&flowtable_lock); list_for_each_entry(flowtable, &flowtables, list) - nf_flow_table_iterate_cleanup(flowtable, dev); + nf_flow_table_gc_cleanup(flowtable, dev); mutex_unlock(&flowtable_lock); } EXPORT_SYMBOL_GPL(nf_flow_table_cleanup); -- cgit v1.2.3 From 1fac52da5942c58dd3e337fd7c5a550925ca752e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 May 2020 02:25:35 +0200 Subject: net: flow_offload: consolidate indirect flow_block infrastructure Tunnel devices provide no dev->netdev_ops->ndo_setup_tc(...) interface. 
The tunnel device and route control plane does not provide an obvious way to relate tunnel and physical devices. This patch allows drivers to register a tunnel device offload handler for the tc and netfilter frontends through flow_indr_dev_register() and flow_indr_dev_unregister(). The frontend calls flow_indr_dev_setup_offload() that iterates over the list of drivers that are offering tunnel device hardware offload support and it sets up the flow block for this tunnel device. If the driver module is removed, the indirect flow_block ends up with a stale callback reference. The module removal path triggers the dev_shutdown() path to remove the qdisc and the flow_blocks for the physical devices. However, this is not useful for tunnel devices, where relation between the physical and the tunnel device is not explicit. This patch introduces a cleanup callback that is invoked when the driver module is removed to clean up the tunnel device flow_block. This patch defines struct flow_block_indr and it uses it from flow_block_cb to store the information that front-end requires to perform the flow_block_cb cleanup on module removal. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/flow_offload.h | 19 ++++++ net/core/flow_offload.c | 157 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 176 insertions(+) (limited to 'net') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 95d633785ef9..5493282348fa 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -443,6 +443,16 @@ enum tc_setup_type; typedef int flow_setup_cb_t(enum tc_setup_type type, void *type_data, void *cb_priv); +struct flow_block_cb; + +struct flow_block_indr { + struct list_head list; + struct net_device *dev; + enum flow_block_binder_type binder_type; + void *data; + void (*cleanup)(struct flow_block_cb *block_cb); +}; + struct flow_block_cb { struct list_head driver_list; struct list_head list; @@ -450,6 +460,7 @@ struct flow_block_cb { void *cb_ident; void *cb_priv; void (*release)(void *cb_priv); + struct flow_block_indr indr; unsigned int refcnt; }; @@ -523,6 +534,14 @@ static inline void flow_block_init(struct flow_block *flow_block) typedef int flow_indr_block_bind_cb_t(struct net_device *dev, void *cb_priv, enum tc_setup_type type, void *type_data); +int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv); +void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv, + flow_setup_cb_t *setup_cb); +int flow_indr_dev_setup_offload(struct net_device *dev, + enum tc_setup_type type, void *data, + struct flow_block_offload *bo, + void (*cleanup)(struct flow_block_cb *block_cb)); + typedef void flow_indr_block_cmd_t(struct net_device *dev, flow_indr_block_bind_cb_t *cb, void *cb_priv, enum flow_block_command command); diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index e64941c526b1..8cd7da2586ae 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -317,6 +317,163 @@ int flow_block_cb_setup_simple(struct flow_block_offload *f, } EXPORT_SYMBOL(flow_block_cb_setup_simple); +static DEFINE_MUTEX(flow_indr_block_lock); +static LIST_HEAD(flow_block_indr_list); +static LIST_HEAD(flow_block_indr_dev_list); + +struct flow_indr_dev { + struct list_head list; + flow_indr_block_bind_cb_t *cb; + void *cb_priv; + refcount_t refcnt; + struct rcu_head rcu; +}; + +static struct flow_indr_dev *flow_indr_dev_alloc(flow_indr_block_bind_cb_t *cb, + void *cb_priv) +{ + struct flow_indr_dev *indr_dev; + + 
indr_dev = kmalloc(sizeof(*indr_dev), GFP_KERNEL); + if (!indr_dev) + return NULL; + + indr_dev->cb = cb; + indr_dev->cb_priv = cb_priv; + refcount_set(&indr_dev->refcnt, 1); + + return indr_dev; +} + +int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv) +{ + struct flow_indr_dev *indr_dev; + + mutex_lock(&flow_indr_block_lock); + list_for_each_entry(indr_dev, &flow_block_indr_dev_list, list) { + if (indr_dev->cb == cb && + indr_dev->cb_priv == cb_priv) { + refcount_inc(&indr_dev->refcnt); + mutex_unlock(&flow_indr_block_lock); + return 0; + } + } + + indr_dev = flow_indr_dev_alloc(cb, cb_priv); + if (!indr_dev) { + mutex_unlock(&flow_indr_block_lock); + return -ENOMEM; + } + + list_add(&indr_dev->list, &flow_block_indr_dev_list); + mutex_unlock(&flow_indr_block_lock); + + return 0; +} +EXPORT_SYMBOL(flow_indr_dev_register); + +static void __flow_block_indr_cleanup(flow_setup_cb_t *setup_cb, void *cb_priv, + struct list_head *cleanup_list) +{ + struct flow_block_cb *this, *next; + + list_for_each_entry_safe(this, next, &flow_block_indr_list, indr.list) { + if (this->cb == setup_cb && + this->cb_priv == cb_priv) { + list_move(&this->indr.list, cleanup_list); + return; + } + } +} + +static void flow_block_indr_notify(struct list_head *cleanup_list) +{ + struct flow_block_cb *this, *next; + + list_for_each_entry_safe(this, next, cleanup_list, indr.list) { + list_del(&this->indr.list); + this->indr.cleanup(this); + } +} + +void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv, + flow_setup_cb_t *setup_cb) +{ + struct flow_indr_dev *this, *next, *indr_dev = NULL; + LIST_HEAD(cleanup_list); + + mutex_lock(&flow_indr_block_lock); + list_for_each_entry_safe(this, next, &flow_block_indr_dev_list, list) { + if (this->cb == cb && + this->cb_priv == cb_priv && + refcount_dec_and_test(&this->refcnt)) { + indr_dev = this; + list_del(&indr_dev->list); + break; + } + } + + if (!indr_dev) { + mutex_unlock(&flow_indr_block_lock); + return; + } + + __flow_block_indr_cleanup(setup_cb, cb_priv, &cleanup_list); + mutex_unlock(&flow_indr_block_lock); + + flow_block_indr_notify(&cleanup_list); + kfree(indr_dev); +} +EXPORT_SYMBOL(flow_indr_dev_unregister); + +static void flow_block_indr_init(struct flow_block_cb *flow_block, + struct flow_block_offload *bo, + struct net_device *dev, void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) +{ + flow_block->indr.binder_type = bo->binder_type; + flow_block->indr.data = data; + flow_block->indr.dev = dev; + flow_block->indr.cleanup = cleanup; +} + +static void __flow_block_indr_binding(struct flow_block_offload *bo, + struct net_device *dev, void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) +{ + struct flow_block_cb *block_cb; + + list_for_each_entry(block_cb, &bo->cb_list, list) { + switch (bo->command) { + case FLOW_BLOCK_BIND: + flow_block_indr_init(block_cb, bo, dev, data, cleanup); + list_add(&block_cb->indr.list, &flow_block_indr_list); + break; + case FLOW_BLOCK_UNBIND: + list_del(&block_cb->indr.list); + break; + } + } +} + +int flow_indr_dev_setup_offload(struct net_device *dev, + enum tc_setup_type type, void *data, + struct flow_block_offload *bo, + void (*cleanup)(struct flow_block_cb *block_cb)) +{ + struct flow_indr_dev *this; + + mutex_lock(&flow_indr_block_lock); + list_for_each_entry(this, &flow_block_indr_dev_list, list) + this->cb(dev, this->cb_priv, type, bo); + + __flow_block_indr_binding(bo, dev, data, cleanup); + mutex_unlock(&flow_indr_block_lock); + + return list_empty(&bo->cb_list) 
? -EOPNOTSUPP : 0; +} +EXPORT_SYMBOL(flow_indr_dev_setup_offload); + static LIST_HEAD(block_cb_list); static struct rhashtable indr_setup_block_ht; -- cgit v1.2.3 From 324a823b9962a0f290c40fb6314926d434193276 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 May 2020 02:25:36 +0200 Subject: net: cls_api: add tcf_block_offload_init() Add a helper function to initialize the flow_block_offload structure. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/sched/cls_api.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 752d608f4442..c5a2f16097b6 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -693,6 +693,22 @@ static void tc_indr_block_get_and_cmd(struct net_device *dev, tc_indr_block_cmd(dev, block, cb, cb_priv, command, false); } +static void tcf_block_offload_init(struct flow_block_offload *bo, + struct net_device *dev, + enum flow_block_command command, + enum flow_block_binder_type binder_type, + struct flow_block *flow_block, + bool shared, struct netlink_ext_ack *extack) +{ + bo->net = dev_net(dev); + bo->command = command; + bo->binder_type = binder_type; + bo->block = flow_block; + bo->block_shared = shared; + bo->extack = extack; + INIT_LIST_HEAD(&bo->cb_list); +} + static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev, struct tcf_block_ext_info *ei, @@ -727,13 +743,9 @@ static int tcf_block_offload_cmd(struct tcf_block *block, struct flow_block_offload bo = {}; int err; - bo.net = dev_net(dev); - bo.command = command; - bo.binder_type = ei->binder_type; - bo.block = &block->flow_block; - bo.block_shared = tcf_block_shared(block); - bo.extack = extack; - INIT_LIST_HEAD(&bo.cb_list); + tcf_block_offload_init(&bo, dev, command, ei->binder_type, + &block->flow_block, tcf_block_shared(block), + extack); err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); if (err < 0) { -- cgit v1.2.3 From 0fdcf78d59737939ea449b512d02c3733a22c8e1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 May 2020 02:25:37 +0200 Subject: net: use flow_indr_dev_setup_offload() Update existing frontends to use flow_indr_dev_setup_offload(). This new function must be called if ->ndo_setup_tc is unset to deal with tunnel devices. If there is no driver that is subscribed to new tunnel device flow_block bindings, then this function bails out with EOPNOTSUPP. If the driver module is removed, the ->cleanup() callback removes the entries that belong to this tunnel device. This cleanup procedures is triggered when the device unregisters the tunnel device offload handler. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. 
Miller --- net/netfilter/nf_flow_table_offload.c | 19 +++++++++--- net/netfilter/nf_tables_offload.c | 28 ++++++++++++++--- net/sched/cls_api.c | 58 +++++++++++++++++------------------ 3 files changed, 67 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 2ff4087007a6..01cfa02c43bd 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -942,6 +942,18 @@ static void nf_flow_table_block_offload_init(struct flow_block_offload *bo, INIT_LIST_HEAD(&bo->cb_list); } +static void nf_flow_table_indr_cleanup(struct flow_block_cb *block_cb) +{ + struct nf_flowtable *flowtable = block_cb->indr.data; + struct net_device *dev = block_cb->indr.dev; + + nf_flow_table_gc_cleanup(flowtable, dev); + down_write(&flowtable->flow_block_lock); + list_del(&block_cb->list); + flow_block_cb_free(block_cb); + up_write(&flowtable->flow_block_lock); +} + static int nf_flow_table_indr_offload_cmd(struct flow_block_offload *bo, struct nf_flowtable *flowtable, struct net_device *dev, @@ -950,12 +962,9 @@ static int nf_flow_table_indr_offload_cmd(struct flow_block_offload *bo, { nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable, extack); - flow_indr_block_call(dev, bo, cmd, TC_SETUP_FT); - if (list_empty(&bo->cb_list)) - return -EOPNOTSUPP; - - return 0; + return flow_indr_dev_setup_offload(dev, TC_SETUP_FT, flowtable, bo, + nf_flow_table_indr_cleanup); } static int nf_flow_table_offload_cmd(struct flow_block_offload *bo, diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 954bccb7f32a..1960f11477e8 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -304,21 +304,41 @@ static void nft_indr_block_ing_cmd(struct net_device *dev, nft_block_setup(chain, &bo, cmd); } -static int nft_indr_block_offload_cmd(struct nft_base_chain *chain, +static void nft_indr_block_cleanup(struct flow_block_cb *block_cb) +{ + struct nft_base_chain *basechain = block_cb->indr.data; + struct net_device *dev = block_cb->indr.dev; + struct netlink_ext_ack extack = {}; + struct net *net = dev_net(dev); + struct flow_block_offload bo; + + nft_flow_block_offload_init(&bo, dev_net(dev), FLOW_BLOCK_UNBIND, + basechain, &extack); + mutex_lock(&net->nft.commit_mutex); + list_move(&block_cb->list, &bo.cb_list); + nft_flow_offload_unbind(&bo, basechain); + mutex_unlock(&net->nft.commit_mutex); +} + +static int nft_indr_block_offload_cmd(struct nft_base_chain *basechain, struct net_device *dev, enum flow_block_command cmd) { struct netlink_ext_ack extack = {}; struct flow_block_offload bo; + int err; - nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); + nft_flow_block_offload_init(&bo, dev_net(dev), cmd, basechain, &extack); - flow_indr_block_call(dev, &bo, cmd, TC_SETUP_BLOCK); + err = flow_indr_dev_setup_offload(dev, TC_SETUP_BLOCK, basechain, &bo, + nft_indr_block_cleanup); + if (err < 0) + return err; if (list_empty(&bo.cb_list)) return -EOPNOTSUPP; - return nft_block_setup(chain, &bo, cmd); + return nft_block_setup(basechain, &bo, cmd); } #define FLOW_SETUP_BLOCK TC_SETUP_BLOCK diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index c5a2f16097b6..760e51d852f5 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -709,24 +709,26 @@ static void tcf_block_offload_init(struct flow_block_offload *bo, INIT_LIST_HEAD(&bo->cb_list); } -static void tc_indr_block_call(struct tcf_block *block, - struct 
net_device *dev, - struct tcf_block_ext_info *ei, - enum flow_block_command command, - struct netlink_ext_ack *extack) +static void tcf_block_unbind(struct tcf_block *block, + struct flow_block_offload *bo); + +static void tc_block_indr_cleanup(struct flow_block_cb *block_cb) { - struct flow_block_offload bo = { - .command = command, - .binder_type = ei->binder_type, - .net = dev_net(dev), - .block = &block->flow_block, - .block_shared = tcf_block_shared(block), - .extack = extack, - }; - INIT_LIST_HEAD(&bo.cb_list); + struct tcf_block *block = block_cb->indr.data; + struct net_device *dev = block_cb->indr.dev; + struct netlink_ext_ack extack = {}; + struct flow_block_offload bo; - flow_indr_block_call(dev, &bo, command, TC_SETUP_BLOCK); - tcf_block_setup(block, &bo); + tcf_block_offload_init(&bo, dev, FLOW_BLOCK_UNBIND, + block_cb->indr.binder_type, + &block->flow_block, tcf_block_shared(block), + &extack); + down_write(&block->cb_lock); + list_move(&block_cb->list, &bo.cb_list); + up_write(&block->cb_lock); + rtnl_lock(); + tcf_block_unbind(block, &bo); + rtnl_unlock(); } static bool tcf_block_offload_in_use(struct tcf_block *block) @@ -747,7 +749,12 @@ static int tcf_block_offload_cmd(struct tcf_block *block, &block->flow_block, tcf_block_shared(block), extack); - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); + if (dev->netdev_ops->ndo_setup_tc) + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); + else + err = flow_indr_dev_setup_offload(dev, TC_SETUP_BLOCK, block, + &bo, tc_block_indr_cleanup); + if (err < 0) { if (err != -EOPNOTSUPP) NL_SET_ERR_MSG(extack, "Driver ndo_setup_tc failed"); @@ -765,13 +772,13 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, int err; down_write(&block->cb_lock); - if (!dev->netdev_ops->ndo_setup_tc) - goto no_offload_dev_inc; /* If tc offload feature is disabled and the block we try to bind * to already has some offloaded filters, forbid to bind. */ - if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) { + if (dev->netdev_ops->ndo_setup_tc && + !tc_can_offload(dev) && + tcf_block_offload_in_use(block)) { NL_SET_ERR_MSG(extack, "Bind to offloaded block failed as dev has offload disabled"); err = -EOPNOTSUPP; goto err_unlock; @@ -783,18 +790,15 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, if (err) goto err_unlock; - tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack); up_write(&block->cb_lock); return 0; no_offload_dev_inc: - if (tcf_block_offload_in_use(block)) { - err = -EOPNOTSUPP; + if (tcf_block_offload_in_use(block)) goto err_unlock; - } + err = 0; block->nooffloaddevcnt++; - tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack); err_unlock: up_write(&block->cb_lock); return err; @@ -807,10 +811,6 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, int err; down_write(&block->cb_lock); - tc_indr_block_call(block, dev, ei, FLOW_BLOCK_UNBIND, NULL); - - if (!dev->netdev_ops->ndo_setup_tc) - goto no_offload_dev_dec; err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_UNBIND, NULL); if (err == -EOPNOTSUPP) goto no_offload_dev_dec; -- cgit v1.2.3 From 709ffbe19b777e8fc952e2fdcfd8e6f50c8ef08c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 May 2020 02:25:41 +0200 Subject: net: remove indirect block netdev event registration Drivers do not register to netdev events to set up indirect blocks anymore. Remove __flow_indr_block_cb_register() and __flow_indr_block_cb_unregister(). 
The frontends set up the callbacks through flow_indr_dev_setup_block() Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/flow_offload.h | 9 -- net/core/flow_offload.c | 238 ---------------------------------- net/netfilter/nf_flow_table_offload.c | 66 ---------- net/netfilter/nf_tables_offload.c | 53 +------- net/sched/cls_api.c | 79 ----------- 5 files changed, 1 insertion(+), 444 deletions(-) (limited to 'net') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 5493282348fa..69e13c8b6b3a 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -546,15 +546,6 @@ typedef void flow_indr_block_cmd_t(struct net_device *dev, flow_indr_block_bind_cb_t *cb, void *cb_priv, enum flow_block_command command); -struct flow_indr_block_entry { - flow_indr_block_cmd_t *cb; - struct list_head list; -}; - -void flow_indr_add_block_cb(struct flow_indr_block_entry *entry); - -void flow_indr_del_block_cb(struct flow_indr_block_entry *entry); - int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, flow_indr_block_bind_cb_t *cb, void *cb_ident); diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index 8cd7da2586ae..0cfc35e6be28 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -473,241 +473,3 @@ int flow_indr_dev_setup_offload(struct net_device *dev, return list_empty(&bo->cb_list) ? -EOPNOTSUPP : 0; } EXPORT_SYMBOL(flow_indr_dev_setup_offload); - -static LIST_HEAD(block_cb_list); - -static struct rhashtable indr_setup_block_ht; - -struct flow_indr_block_cb { - struct list_head list; - void *cb_priv; - flow_indr_block_bind_cb_t *cb; - void *cb_ident; -}; - -struct flow_indr_block_dev { - struct rhash_head ht_node; - struct net_device *dev; - unsigned int refcnt; - struct list_head cb_list; -}; - -static const struct rhashtable_params flow_indr_setup_block_ht_params = { - .key_offset = offsetof(struct flow_indr_block_dev, dev), - .head_offset = offsetof(struct flow_indr_block_dev, ht_node), - .key_len = sizeof(struct net_device *), -}; - -static struct flow_indr_block_dev * -flow_indr_block_dev_lookup(struct net_device *dev) -{ - return rhashtable_lookup_fast(&indr_setup_block_ht, &dev, - flow_indr_setup_block_ht_params); -} - -static struct flow_indr_block_dev * -flow_indr_block_dev_get(struct net_device *dev) -{ - struct flow_indr_block_dev *indr_dev; - - indr_dev = flow_indr_block_dev_lookup(dev); - if (indr_dev) - goto inc_ref; - - indr_dev = kzalloc(sizeof(*indr_dev), GFP_KERNEL); - if (!indr_dev) - return NULL; - - INIT_LIST_HEAD(&indr_dev->cb_list); - indr_dev->dev = dev; - if (rhashtable_insert_fast(&indr_setup_block_ht, &indr_dev->ht_node, - flow_indr_setup_block_ht_params)) { - kfree(indr_dev); - return NULL; - } - -inc_ref: - indr_dev->refcnt++; - return indr_dev; -} - -static void flow_indr_block_dev_put(struct flow_indr_block_dev *indr_dev) -{ - if (--indr_dev->refcnt) - return; - - rhashtable_remove_fast(&indr_setup_block_ht, &indr_dev->ht_node, - flow_indr_setup_block_ht_params); - kfree(indr_dev); -} - -static struct flow_indr_block_cb * -flow_indr_block_cb_lookup(struct flow_indr_block_dev *indr_dev, - flow_indr_block_bind_cb_t *cb, void *cb_ident) -{ - struct flow_indr_block_cb *indr_block_cb; - - list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list) - if (indr_block_cb->cb == cb && - indr_block_cb->cb_ident == cb_ident) - return indr_block_cb; - return NULL; -} - -static struct flow_indr_block_cb * -flow_indr_block_cb_add(struct flow_indr_block_dev *indr_dev, void 
*cb_priv, - flow_indr_block_bind_cb_t *cb, void *cb_ident) -{ - struct flow_indr_block_cb *indr_block_cb; - - indr_block_cb = flow_indr_block_cb_lookup(indr_dev, cb, cb_ident); - if (indr_block_cb) - return ERR_PTR(-EEXIST); - - indr_block_cb = kzalloc(sizeof(*indr_block_cb), GFP_KERNEL); - if (!indr_block_cb) - return ERR_PTR(-ENOMEM); - - indr_block_cb->cb_priv = cb_priv; - indr_block_cb->cb = cb; - indr_block_cb->cb_ident = cb_ident; - list_add(&indr_block_cb->list, &indr_dev->cb_list); - - return indr_block_cb; -} - -static void flow_indr_block_cb_del(struct flow_indr_block_cb *indr_block_cb) -{ - list_del(&indr_block_cb->list); - kfree(indr_block_cb); -} - -static DEFINE_MUTEX(flow_indr_block_cb_lock); - -static void flow_block_cmd(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, void *cb_priv, - enum flow_block_command command) -{ - struct flow_indr_block_entry *entry; - - mutex_lock(&flow_indr_block_cb_lock); - list_for_each_entry(entry, &block_cb_list, list) { - entry->cb(dev, cb, cb_priv, command); - } - mutex_unlock(&flow_indr_block_cb_lock); -} - -int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, - flow_indr_block_bind_cb_t *cb, - void *cb_ident) -{ - struct flow_indr_block_cb *indr_block_cb; - struct flow_indr_block_dev *indr_dev; - int err; - - indr_dev = flow_indr_block_dev_get(dev); - if (!indr_dev) - return -ENOMEM; - - indr_block_cb = flow_indr_block_cb_add(indr_dev, cb_priv, cb, cb_ident); - err = PTR_ERR_OR_ZERO(indr_block_cb); - if (err) - goto err_dev_put; - - flow_block_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv, - FLOW_BLOCK_BIND); - - return 0; - -err_dev_put: - flow_indr_block_dev_put(indr_dev); - return err; -} -EXPORT_SYMBOL_GPL(__flow_indr_block_cb_register); - -int flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, - flow_indr_block_bind_cb_t *cb, - void *cb_ident) -{ - int err; - - rtnl_lock(); - err = __flow_indr_block_cb_register(dev, cb_priv, cb, cb_ident); - rtnl_unlock(); - - return err; -} -EXPORT_SYMBOL_GPL(flow_indr_block_cb_register); - -void __flow_indr_block_cb_unregister(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_ident) -{ - struct flow_indr_block_cb *indr_block_cb; - struct flow_indr_block_dev *indr_dev; - - indr_dev = flow_indr_block_dev_lookup(dev); - if (!indr_dev) - return; - - indr_block_cb = flow_indr_block_cb_lookup(indr_dev, cb, cb_ident); - if (!indr_block_cb) - return; - - flow_block_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv, - FLOW_BLOCK_UNBIND); - - flow_indr_block_cb_del(indr_block_cb); - flow_indr_block_dev_put(indr_dev); -} -EXPORT_SYMBOL_GPL(__flow_indr_block_cb_unregister); - -void flow_indr_block_cb_unregister(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_ident) -{ - rtnl_lock(); - __flow_indr_block_cb_unregister(dev, cb, cb_ident); - rtnl_unlock(); -} -EXPORT_SYMBOL_GPL(flow_indr_block_cb_unregister); - -void flow_indr_block_call(struct net_device *dev, - struct flow_block_offload *bo, - enum flow_block_command command, - enum tc_setup_type type) -{ - struct flow_indr_block_cb *indr_block_cb; - struct flow_indr_block_dev *indr_dev; - - indr_dev = flow_indr_block_dev_lookup(dev); - if (!indr_dev) - return; - - list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list) - indr_block_cb->cb(dev, indr_block_cb->cb_priv, type, bo); -} -EXPORT_SYMBOL_GPL(flow_indr_block_call); - -void flow_indr_add_block_cb(struct flow_indr_block_entry *entry) -{ - mutex_lock(&flow_indr_block_cb_lock); - list_add_tail(&entry->list, 
&block_cb_list); - mutex_unlock(&flow_indr_block_cb_lock); -} -EXPORT_SYMBOL_GPL(flow_indr_add_block_cb); - -void flow_indr_del_block_cb(struct flow_indr_block_entry *entry) -{ - mutex_lock(&flow_indr_block_cb_lock); - list_del(&entry->list); - mutex_unlock(&flow_indr_block_cb_lock); -} -EXPORT_SYMBOL_GPL(flow_indr_del_block_cb); - -static int __init init_flow_indr_rhashtable(void) -{ - return rhashtable_init(&indr_setup_block_ht, - &flow_indr_setup_block_ht_params); -} -subsys_initcall(init_flow_indr_rhashtable); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 01cfa02c43bd..62651e6683f6 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -1008,69 +1008,6 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, } EXPORT_SYMBOL_GPL(nf_flow_table_offload_setup); -static void nf_flow_table_indr_block_ing_cmd(struct net_device *dev, - struct nf_flowtable *flowtable, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command cmd) -{ - struct netlink_ext_ack extack = {}; - struct flow_block_offload bo; - - if (!flowtable) - return; - - nf_flow_table_block_offload_init(&bo, dev_net(dev), cmd, flowtable, - &extack); - - cb(dev, cb_priv, TC_SETUP_FT, &bo); - - nf_flow_table_block_setup(flowtable, &bo, cmd); -} - -static void nf_flow_table_indr_block_cb_cmd(struct nf_flowtable *flowtable, - struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command cmd) -{ - if (!(flowtable->flags & NF_FLOWTABLE_HW_OFFLOAD)) - return; - - nf_flow_table_indr_block_ing_cmd(dev, flowtable, cb, cb_priv, cmd); -} - -static void nf_flow_table_indr_block_cb(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command cmd) -{ - struct net *net = dev_net(dev); - struct nft_flowtable *nft_ft; - struct nft_table *table; - struct nft_hook *hook; - - mutex_lock(&net->nft.commit_mutex); - list_for_each_entry(table, &net->nft.tables, list) { - list_for_each_entry(nft_ft, &table->flowtables, list) { - list_for_each_entry(hook, &nft_ft->hook_list, list) { - if (hook->ops.dev != dev) - continue; - - nf_flow_table_indr_block_cb_cmd(&nft_ft->data, - dev, cb, - cb_priv, cmd); - } - } - } - mutex_unlock(&net->nft.commit_mutex); -} - -static struct flow_indr_block_entry block_ing_entry = { - .cb = nf_flow_table_indr_block_cb, - .list = LIST_HEAD_INIT(block_ing_entry.list), -}; - int nf_flow_table_offload_init(void) { nf_flow_offload_wq = alloc_workqueue("nf_flow_table_offload", @@ -1078,13 +1015,10 @@ int nf_flow_table_offload_init(void) if (!nf_flow_offload_wq) return -ENOMEM; - flow_indr_add_block_cb(&block_ing_entry); - return 0; } void nf_flow_table_offload_exit(void) { - flow_indr_del_block_cb(&block_ing_entry); destroy_workqueue(nf_flow_offload_wq); } diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 1960f11477e8..185fc82c99aa 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -285,25 +285,6 @@ static int nft_block_offload_cmd(struct nft_base_chain *chain, return nft_block_setup(chain, &bo, cmd); } -static void nft_indr_block_ing_cmd(struct net_device *dev, - struct nft_base_chain *chain, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command cmd) -{ - struct netlink_ext_ack extack = {}; - struct flow_block_offload bo; - - if (!chain) - return; - - nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); - - cb(dev, cb_priv, 
TC_SETUP_BLOCK, &bo); - - nft_block_setup(chain, &bo, cmd); -} - static void nft_indr_block_cleanup(struct flow_block_cb *block_cb) { struct nft_base_chain *basechain = block_cb->indr.data; @@ -575,24 +556,6 @@ static struct nft_chain *__nft_offload_get_chain(struct net_device *dev) return NULL; } -static void nft_indr_block_cb(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, void *cb_priv, - enum flow_block_command cmd) -{ - struct net *net = dev_net(dev); - struct nft_chain *chain; - - mutex_lock(&net->nft.commit_mutex); - chain = __nft_offload_get_chain(dev); - if (chain && chain->flags & NFT_CHAIN_HW_OFFLOAD) { - struct nft_base_chain *basechain; - - basechain = nft_base_chain(chain); - nft_indr_block_ing_cmd(dev, basechain, cb, cb_priv, cmd); - } - mutex_unlock(&net->nft.commit_mutex); -} - static int nft_offload_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -614,30 +577,16 @@ static int nft_offload_netdev_event(struct notifier_block *this, return NOTIFY_DONE; } -static struct flow_indr_block_entry block_ing_entry = { - .cb = nft_indr_block_cb, - .list = LIST_HEAD_INIT(block_ing_entry.list), -}; - static struct notifier_block nft_offload_netdev_notifier = { .notifier_call = nft_offload_netdev_event, }; int nft_offload_init(void) { - int err; - - err = register_netdevice_notifier(&nft_offload_netdev_notifier); - if (err < 0) - return err; - - flow_indr_add_block_cb(&block_ing_entry); - - return 0; + return register_netdevice_notifier(&nft_offload_netdev_notifier); } void nft_offload_exit(void) { - flow_indr_del_block_cb(&block_ing_entry); unregister_netdevice_notifier(&nft_offload_netdev_notifier); } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 760e51d852f5..a00a203b2ef5 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -621,78 +621,6 @@ static void tcf_chain_flush(struct tcf_chain *chain, bool rtnl_held) static int tcf_block_setup(struct tcf_block *block, struct flow_block_offload *bo); -static void tc_indr_block_cmd(struct net_device *dev, struct tcf_block *block, - flow_indr_block_bind_cb_t *cb, void *cb_priv, - enum flow_block_command command, bool ingress) -{ - struct flow_block_offload bo = { - .command = command, - .binder_type = ingress ? - FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS : - FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS, - .net = dev_net(dev), - .block_shared = tcf_block_non_null_shared(block), - }; - INIT_LIST_HEAD(&bo.cb_list); - - if (!block) - return; - - bo.block = &block->flow_block; - - down_write(&block->cb_lock); - cb(dev, cb_priv, TC_SETUP_BLOCK, &bo); - - tcf_block_setup(block, &bo); - up_write(&block->cb_lock); -} - -static struct tcf_block *tc_dev_block(struct net_device *dev, bool ingress) -{ - const struct Qdisc_class_ops *cops; - const struct Qdisc_ops *ops; - struct Qdisc *qdisc; - - if (!dev_ingress_queue(dev)) - return NULL; - - qdisc = dev_ingress_queue(dev)->qdisc_sleeping; - if (!qdisc) - return NULL; - - ops = qdisc->ops; - if (!ops) - return NULL; - - if (!ingress && !strcmp("ingress", ops->id)) - return NULL; - - cops = ops->cl_ops; - if (!cops) - return NULL; - - if (!cops->tcf_block) - return NULL; - - return cops->tcf_block(qdisc, - ingress ? 
TC_H_MIN_INGRESS : TC_H_MIN_EGRESS, - NULL); -} - -static void tc_indr_block_get_and_cmd(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command command) -{ - struct tcf_block *block; - - block = tc_dev_block(dev, true); - tc_indr_block_cmd(dev, block, cb, cb_priv, command, true); - - block = tc_dev_block(dev, false); - tc_indr_block_cmd(dev, block, cb, cb_priv, command, false); -} - static void tcf_block_offload_init(struct flow_block_offload *bo, struct net_device *dev, enum flow_block_command command, @@ -3836,11 +3764,6 @@ static struct pernet_operations tcf_net_ops = { .size = sizeof(struct tcf_net), }; -static struct flow_indr_block_entry block_entry = { - .cb = tc_indr_block_get_and_cmd, - .list = LIST_HEAD_INIT(block_entry.list), -}; - static int __init tc_filter_init(void) { int err; @@ -3853,8 +3776,6 @@ static int __init tc_filter_init(void) if (err) goto err_register_pernet_subsys; - flow_indr_add_block_cb(&block_entry); - rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL, RTNL_FLAG_DOIT_UNLOCKED); rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, -- cgit v1.2.3 From 678eb199cc9df3bf1cb12fb2da22768b8d1b6bf3 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:36 +0300 Subject: devlink: Create dedicated trap group for layer 3 exceptions Packets that hit exceptions during layer 3 forwarding must be trapped to the CPU for the control plane to function properly. Create a dedicated group for them, so that user space could choose to assign a different policer for them. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 7 +++++-- include/net/devlink.h | 3 +++ net/core/devlink.c | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index fe089acb7783..4ca241e70064 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -277,8 +277,11 @@ narrow. 
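For driver authors, a minimal sketch of the registration side of such a group (a hypothetical "foo" driver is assumed, as is the trap-group registration helper declared in include/net/devlink.h at this point in the series; only the struct devlink_trap_group members shown in this patch are used):

#include <net/devlink.h>

/* Hypothetical driver code, not part of this patch: register the new
 * generic group so that user space can later bind a dedicated policer
 * to "l3_exceptions" without affecting "l3_drops". */
static const struct devlink_trap_group foo_trap_groups[] = {
	{
		.name = DEVLINK_TRAP_GROUP_GENERIC_NAME_L3_EXCEPTIONS,
		.id = DEVLINK_TRAP_GROUP_GENERIC_ID_L3_EXCEPTIONS,
		.generic = true,
	},
};

static int foo_trap_groups_init(struct devlink *devlink)
{
	return devlink_trap_groups_register(devlink, foo_trap_groups,
					    ARRAY_SIZE(foo_trap_groups));
}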
The description of these groups must be added to the following table: - Contains packet traps for packets that were dropped by the device during layer 2 forwarding (i.e., bridge) * - ``l3_drops`` - - Contains packet traps for packets that were dropped by the device or hit - an exception (e.g., TTL error) during layer 3 forwarding + - Contains packet traps for packets that were dropped by the device during + layer 3 forwarding + * - ``l3_exceptions`` + - Contains packet traps for packets that hit an exception (e.g., TTL + error) during layer 3 forwarding * - ``buffer_drops`` - Contains packet traps for packets that were dropped by the device due to an enqueue decision diff --git a/include/net/devlink.h b/include/net/devlink.h index 8ffc1b5cd89b..851388c9d795 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -657,6 +657,7 @@ enum devlink_trap_generic_id { enum devlink_trap_group_generic_id { DEVLINK_TRAP_GROUP_GENERIC_ID_L2_DROPS, DEVLINK_TRAP_GROUP_GENERIC_ID_L3_DROPS, + DEVLINK_TRAP_GROUP_GENERIC_ID_L3_EXCEPTIONS, DEVLINK_TRAP_GROUP_GENERIC_ID_BUFFER_DROPS, DEVLINK_TRAP_GROUP_GENERIC_ID_TUNNEL_DROPS, DEVLINK_TRAP_GROUP_GENERIC_ID_ACL_DROPS, @@ -730,6 +731,8 @@ enum devlink_trap_group_generic_id { "l2_drops" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L3_DROPS \ "l3_drops" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_L3_EXCEPTIONS \ + "l3_exceptions" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_BUFFER_DROPS \ "buffer_drops" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_TUNNEL_DROPS \ diff --git a/net/core/devlink.c b/net/core/devlink.c index 7b76e5fffc10..d9fff7083f02 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8505,6 +8505,7 @@ static const struct devlink_trap devlink_trap_generic[] = { static const struct devlink_trap_group devlink_trap_group_generic[] = { DEVLINK_TRAP_GROUP(L2_DROPS), DEVLINK_TRAP_GROUP(L3_DROPS), + DEVLINK_TRAP_GROUP(L3_EXCEPTIONS), DEVLINK_TRAP_GROUP(BUFFER_DROPS), DEVLINK_TRAP_GROUP(TUNNEL_DROPS), DEVLINK_TRAP_GROUP(ACL_DROPS), -- cgit v1.2.3 From 9eefeabed6f831018c15bd7e17d34967ee34d9dd Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:39 +0300 Subject: devlink: Add 'mirror' trap action The action is used by control traps such as IGMP query. The packet is flooded by the device, but also trapped to the CPU in order for the software bridge to mark the receiving port as a multicast router port. Such packets are marked with 'skb->offload_fwd_mark = 1' in order to prevent the software bridge from flooding them again. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 2 ++ include/uapi/linux/devlink.h | 3 +++ net/core/devlink.c | 3 ++- 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index 4ca241e70064..5b97327caefc 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -108,6 +108,8 @@ The ``devlink-trap`` mechanism supports the following packet trap actions: * ``trap``: The sole copy of the packet is sent to the CPU. * ``drop``: The packet is dropped by the underlying device and a copy is not sent to the CPU. + * ``mirror``: The packet is forwarded by the underlying device and a copy is + sent to the CPU. 
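As a hedged illustration, this is roughly how a driver might map the three actions onto a hardware forwarding decision (the foo_* names are made up for the example; only enum devlink_trap_action comes from the kernel):

#include <linux/errno.h>
#include <uapi/linux/devlink.h>

/* Hypothetical hardware forwarding decisions for a trapped flow. */
enum foo_hw_action {
	FOO_HW_ACTION_DISCARD,	/* drop: no copy reaches the CPU */
	FOO_HW_ACTION_TRAP,	/* trap: the sole copy goes to the CPU */
	FOO_HW_ACTION_MIRROR,	/* mirror: forward in hardware + CPU copy */
};

static int foo_trap_action_to_hw(enum devlink_trap_action action,
				 enum foo_hw_action *hw_action)
{
	switch (action) {
	case DEVLINK_TRAP_ACTION_DROP:
		*hw_action = FOO_HW_ACTION_DISCARD;
		return 0;
	case DEVLINK_TRAP_ACTION_TRAP:
		*hw_action = FOO_HW_ACTION_TRAP;
		return 0;
	case DEVLINK_TRAP_ACTION_MIRROR:
		*hw_action = FOO_HW_ACTION_MIRROR;
		return 0;
	}

	return -EINVAL;
}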
Generic Packet Traps ==================== diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 1ae90e06c06d..16305932a950 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -233,10 +233,13 @@ enum { * @DEVLINK_TRAP_ACTION_DROP: Packet is dropped by the device and a copy is not * sent to the CPU. * @DEVLINK_TRAP_ACTION_TRAP: The sole copy of the packet is sent to the CPU. + * @DEVLINK_TRAP_ACTION_MIRROR: Packet is forwarded by the device and a copy is + * sent to the CPU. */ enum devlink_trap_action { DEVLINK_TRAP_ACTION_DROP, DEVLINK_TRAP_ACTION_TRAP, + DEVLINK_TRAP_ACTION_MIRROR, }; /** diff --git a/net/core/devlink.c b/net/core/devlink.c index d9fff7083f02..d6298917b077 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -5869,7 +5869,8 @@ devlink_trap_action_get_from_info(struct genl_info *info, val = nla_get_u8(info->attrs[DEVLINK_ATTR_TRAP_ACTION]); switch (val) { case DEVLINK_TRAP_ACTION_DROP: /* fall-through */ - case DEVLINK_TRAP_ACTION_TRAP: + case DEVLINK_TRAP_ACTION_TRAP: /* fall-through */ + case DEVLINK_TRAP_ACTION_MIRROR: *p_trap_action = val; break; default: -- cgit v1.2.3 From 30a4e9a29ab9aadfe6c5386ae4aa396b1d2556c2 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:40 +0300 Subject: devlink: Add 'control' trap type This type is used for traps that trap control packets such as ARP request and IGMP query to the CPU. Do not report such packets to the kernel's drop monitor, as they were neither dropped by the device nor did they encounter an exception during forwarding. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 8 +++++++- include/uapi/linux/devlink.h | 6 ++++++ net/core/devlink.c | 7 +++++++ 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index 5b97327caefc..6c293cfa23ee 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -55,7 +55,7 @@ The following diagram provides a general overview of ``devlink-trap``:: | | +-------^--------+ | - | + | Non-control traps | +----+----+ | | Kernel's Rx path @@ -97,6 +97,12 @@ The ``devlink-trap`` mechanism supports the following packet trap types: processed by ``devlink`` and injected to the kernel's Rx path. Changing the action of such traps is not allowed, as it can easily break the control plane. + * ``control``: Packets were trapped by the device because they are control + packets required for the correct functioning of the control plane. + For example, ARP request and IGMP query packets. Packets are injected to + the kernel's Rx path, but not reported to the kernel's drop monitor. + Changing the action of such traps is not allowed, as it can easily break + the control plane. .. _Trap-Actions: diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 16305932a950..08563e6a424d 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -253,10 +253,16 @@ enum devlink_trap_action { * control plane for resolution. Trapped packets * are processed by devlink and injected to * the kernel's Rx path. + * @DEVLINK_TRAP_TYPE_CONTROL: Packet was trapped because it is required for + * the correct functioning of the control plane. + * For example, an ARP request packet.
Trapped + * packets are injected to the kernel's Rx path, + * but not reported to drop monitor. */ enum devlink_trap_type { DEVLINK_TRAP_TYPE_DROP, DEVLINK_TRAP_TYPE_EXCEPTION, + DEVLINK_TRAP_TYPE_CONTROL, }; enum { diff --git a/net/core/devlink.c b/net/core/devlink.c index d6298917b077..47c28e0f848f 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8847,6 +8847,13 @@ void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb, devlink_trap_stats_update(trap_item->stats, skb->len); devlink_trap_stats_update(trap_item->group_item->stats, skb->len); + /* Control packets were not dropped by the device or encountered an + * exception during forwarding and therefore should not be reported to + * the kernel's drop monitor. + */ + if (trap_item->trap->type == DEVLINK_TRAP_TYPE_CONTROL) + return; + devlink_trap_report_metadata_fill(&hw_metadata, trap_item, in_devlink_port, fa_cookie); net_dm_hw_report(skb, &hw_metadata); -- cgit v1.2.3 From 515eac677fe119433c2a466443bef95c10c550cc Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:41 +0300 Subject: devlink: Add layer 2 control packet traps Add layer 2 control packet traps such as STP and IGMP query, so that capable device drivers could register them with devlink. Add documentation for every added packet trap and packet trap group. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 45 +++++++++++++++++++++ include/net/devlink.h | 48 +++++++++++++++++++++++ net/core/devlink.c | 16 ++++++++ 3 files changed, 109 insertions(+) (limited to 'net') diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index 6c293cfa23ee..e9fc3c9d7d7a 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -252,6 +252,42 @@ be added to the following table: * - ``egress_flow_action_drop`` - ``drop`` - Traps packets dropped during processing of egress flow action drop + * - ``stp`` + - ``control`` + - Traps STP packets + * - ``lacp`` + - ``control`` + - Traps LACP packets + * - ``lldp`` + - ``control`` + - Traps LLDP packets + * - ``igmp_query`` + - ``control`` + - Traps IGMP Membership Query packets + * - ``igmp_v1_report`` + - ``control`` + - Traps IGMP Version 1 Membership Report packets + * - ``igmp_v2_report`` + - ``control`` + - Traps IGMP Version 2 Membership Report packets + * - ``igmp_v3_report`` + - ``control`` + - Traps IGMP Version 3 Membership Report packets + * - ``igmp_v2_leave`` + - ``control`` + - Traps IGMP Version 2 Leave Group packets + * - ``mld_query`` + - ``control`` + - Traps MLD Multicast Listener Query packets + * - ``mld_v1_report`` + - ``control`` + - Traps MLD Version 1 Multicast Listener Report packets + * - ``mld_v2_report`` + - ``control`` + - Traps MLD Version 2 Multicast Listener Report packets + * - ``mld_v1_done`` + - ``control`` + - Traps MLD Version 1 Multicast Listener Done packets Driver-specific Packet Traps ============================ @@ -299,6 +335,15 @@ narrow. 
The description of these groups must be added to the following table: * - ``acl_drops`` - Contains packet traps for packets that were dropped by the device during ACL processing + * - ``stp`` + - Contains packet traps for STP packets + * - ``lacp`` + - Contains packet traps for LACP packets + * - ``lldp`` + - Contains packet traps for LLDP packets + * - ``mc_snooping`` + - Contains packet traps for IGMP and MLD packets required for multicast + snooping Packet Trap Policers ==================== diff --git a/include/net/devlink.h b/include/net/devlink.h index 851388c9d795..c0061542ad65 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -645,6 +645,18 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_OVERLAY_SMAC_MC, DEVLINK_TRAP_GENERIC_ID_INGRESS_FLOW_ACTION_DROP, DEVLINK_TRAP_GENERIC_ID_EGRESS_FLOW_ACTION_DROP, + DEVLINK_TRAP_GENERIC_ID_STP, + DEVLINK_TRAP_GENERIC_ID_LACP, + DEVLINK_TRAP_GENERIC_ID_LLDP, + DEVLINK_TRAP_GENERIC_ID_IGMP_QUERY, + DEVLINK_TRAP_GENERIC_ID_IGMP_V1_REPORT, + DEVLINK_TRAP_GENERIC_ID_IGMP_V2_REPORT, + DEVLINK_TRAP_GENERIC_ID_IGMP_V3_REPORT, + DEVLINK_TRAP_GENERIC_ID_IGMP_V2_LEAVE, + DEVLINK_TRAP_GENERIC_ID_MLD_QUERY, + DEVLINK_TRAP_GENERIC_ID_MLD_V1_REPORT, + DEVLINK_TRAP_GENERIC_ID_MLD_V2_REPORT, + DEVLINK_TRAP_GENERIC_ID_MLD_V1_DONE, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -661,6 +673,10 @@ enum devlink_trap_group_generic_id { DEVLINK_TRAP_GROUP_GENERIC_ID_BUFFER_DROPS, DEVLINK_TRAP_GROUP_GENERIC_ID_TUNNEL_DROPS, DEVLINK_TRAP_GROUP_GENERIC_ID_ACL_DROPS, + DEVLINK_TRAP_GROUP_GENERIC_ID_STP, + DEVLINK_TRAP_GROUP_GENERIC_ID_LACP, + DEVLINK_TRAP_GROUP_GENERIC_ID_LLDP, + DEVLINK_TRAP_GROUP_GENERIC_ID_MC_SNOOPING, /* Add new generic trap group IDs above */ __DEVLINK_TRAP_GROUP_GENERIC_ID_MAX, @@ -726,6 +742,30 @@ enum devlink_trap_group_generic_id { "ingress_flow_action_drop" #define DEVLINK_TRAP_GENERIC_NAME_EGRESS_FLOW_ACTION_DROP \ "egress_flow_action_drop" +#define DEVLINK_TRAP_GENERIC_NAME_STP \ + "stp" +#define DEVLINK_TRAP_GENERIC_NAME_LACP \ + "lacp" +#define DEVLINK_TRAP_GENERIC_NAME_LLDP \ + "lldp" +#define DEVLINK_TRAP_GENERIC_NAME_IGMP_QUERY \ + "igmp_query" +#define DEVLINK_TRAP_GENERIC_NAME_IGMP_V1_REPORT \ + "igmp_v1_report" +#define DEVLINK_TRAP_GENERIC_NAME_IGMP_V2_REPORT \ + "igmp_v2_report" +#define DEVLINK_TRAP_GENERIC_NAME_IGMP_V3_REPORT \ + "igmp_v3_report" +#define DEVLINK_TRAP_GENERIC_NAME_IGMP_V2_LEAVE \ + "igmp_v2_leave" +#define DEVLINK_TRAP_GENERIC_NAME_MLD_QUERY \ + "mld_query" +#define DEVLINK_TRAP_GENERIC_NAME_MLD_V1_REPORT \ + "mld_v1_report" +#define DEVLINK_TRAP_GENERIC_NAME_MLD_V2_REPORT \ + "mld_v2_report" +#define DEVLINK_TRAP_GENERIC_NAME_MLD_V1_DONE \ + "mld_v1_done" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" @@ -739,6 +779,14 @@ enum devlink_trap_group_generic_id { "tunnel_drops" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_ACL_DROPS \ "acl_drops" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_STP \ + "stp" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_LACP \ + "lacp" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_LLDP \ + "lldp" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_MC_SNOOPING \ + "mc_snooping" #define DEVLINK_TRAP_GENERIC(_type, _init_action, _id, _group_id, \ _metadata_cap) \ diff --git a/net/core/devlink.c b/net/core/devlink.c index 47c28e0f848f..c91ef1b5f738 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8495,6 +8495,18 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(OVERLAY_SMAC_MC, DROP), DEVLINK_TRAP(INGRESS_FLOW_ACTION_DROP, 
DROP), DEVLINK_TRAP(EGRESS_FLOW_ACTION_DROP, DROP), + DEVLINK_TRAP(STP, CONTROL), + DEVLINK_TRAP(LACP, CONTROL), + DEVLINK_TRAP(LLDP, CONTROL), + DEVLINK_TRAP(IGMP_QUERY, CONTROL), + DEVLINK_TRAP(IGMP_V1_REPORT, CONTROL), + DEVLINK_TRAP(IGMP_V2_REPORT, CONTROL), + DEVLINK_TRAP(IGMP_V3_REPORT, CONTROL), + DEVLINK_TRAP(IGMP_V2_LEAVE, CONTROL), + DEVLINK_TRAP(MLD_QUERY, CONTROL), + DEVLINK_TRAP(MLD_V1_REPORT, CONTROL), + DEVLINK_TRAP(MLD_V2_REPORT, CONTROL), + DEVLINK_TRAP(MLD_V1_DONE, CONTROL), }; #define DEVLINK_TRAP_GROUP(_id) \ @@ -8510,6 +8522,10 @@ static const struct devlink_trap_group devlink_trap_group_generic[] = { DEVLINK_TRAP_GROUP(BUFFER_DROPS), DEVLINK_TRAP_GROUP(TUNNEL_DROPS), DEVLINK_TRAP_GROUP(ACL_DROPS), + DEVLINK_TRAP_GROUP(STP), + DEVLINK_TRAP_GROUP(LACP), + DEVLINK_TRAP_GROUP(LLDP), + DEVLINK_TRAP_GROUP(MC_SNOOPING), }; static int devlink_trap_generic_verify(const struct devlink_trap *trap) -- cgit v1.2.3 From d77cfd162a346259222d0207a95bf1a0cc0c2520 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:42 +0300 Subject: devlink: Add layer 3 control packet traps Add layer 3 control packet traps such as ARP and DHCP, so that capable device drivers could register them with devlink. Add documentation for every added packet trap and packet trap group. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 143 ++++++++++++++++++++++ include/net/devlink.h | 126 +++++++++++++++++++ net/core/devlink.c | 42 +++++++ 3 files changed, 311 insertions(+) (limited to 'net') diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index e9fc3c9d7d7a..621b634b16be 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -288,6 +288,115 @@ be added to the following table: * - ``mld_v1_done`` - ``control`` - Traps MLD Version 1 Multicast Listener Done packets + * - ``ipv4_dhcp`` + - ``control`` + - Traps IPv4 DHCP packets + * - ``ipv6_dhcp`` + - ``control`` + - Traps IPv6 DHCP packets + * - ``arp_request`` + - ``control`` + - Traps ARP request packets + * - ``arp_response`` + - ``control`` + - Traps ARP response packets + * - ``arp_overlay`` + - ``control`` + - Traps NVE-decapsulated ARP packets that reached the overlay network. + This is required, for example, when the address that needs to be + resolved is a local address + * - ``ipv6_neigh_solicit`` + - ``control`` + - Traps IPv6 Neighbour Solicitation packets + * - ``ipv6_neigh_advert`` + - ``control`` + - Traps IPv6 Neighbour Advertisement packets + * - ``ipv4_bfd`` + - ``control`` + - Traps IPv4 BFD packets + * - ``ipv6_bfd`` + - ``control`` + - Traps IPv6 BFD packets + * - ``ipv4_ospf`` + - ``control`` + - Traps IPv4 OSPF packets + * - ``ipv6_ospf`` + - ``control`` + - Traps IPv6 OSPF packets + * - ``ipv4_bgp`` + - ``control`` + - Traps IPv4 BGP packets + * - ``ipv6_bgp`` + - ``control`` + - Traps IPv6 BGP packets + * - ``ipv4_vrrp`` + - ``control`` + - Traps IPv4 VRRP packets + * - ``ipv6_vrrp`` + - ``control`` + - Traps IPv6 VRRP packets + * - ``ipv4_pim`` + - ``control`` + - Traps IPv4 PIM packets + * - ``ipv6_pim`` + - ``control`` + - Traps IPv6 PIM packets + * - ``uc_loopback`` + - ``control`` + - Traps unicast packets that need to be routed through the same layer 3 + interface from which they were received. 
Such packets are routed by the + kernel, but also cause it to potentially generate ICMP redirect packets * - ``local_route`` + - ``control`` + - Traps unicast packets that hit a local route and need to be locally + delivered * - ``external_route`` + - ``control`` + - Traps packets that should be routed through an external interface (e.g., + management interface) that does not belong to the same device (e.g., + switch ASIC) as the ingress interface * - ``ipv6_uc_dip_link_local_scope`` + - ``control`` + - Traps unicast IPv6 packets that need to be routed and have a destination + IP address with a link-local scope (i.e., fe80::/10). The trap allows + device drivers to avoid programming link-local routes, but still receive + packets for local delivery * - ``ipv6_dip_all_nodes`` + - ``control`` + - Traps IPv6 packets whose destination IP address is the "All Nodes + Address" (i.e., ff02::1) * - ``ipv6_dip_all_routers`` + - ``control`` + - Traps IPv6 packets whose destination IP address is the "All Routers + Address" (i.e., ff02::2) * - ``ipv6_router_solicit`` + - ``control`` + - Traps IPv6 Router Solicitation packets * - ``ipv6_router_advert`` + - ``control`` + - Traps IPv6 Router Advertisement packets * - ``ipv6_redirect`` + - ``control`` + - Traps IPv6 Redirect Message packets * - ``ipv4_router_alert`` + - ``control`` + - Traps IPv4 packets that need to be routed and include the Router Alert + option. Such packets need to be locally delivered to raw sockets that + have the IP_ROUTER_ALERT socket option set * - ``ipv6_router_alert`` + - ``control`` + - Traps IPv6 packets that need to be routed and include the Router Alert + option in their Hop-by-Hop extension header. Such packets need to be + locally delivered to raw sockets that have the IPV6_ROUTER_ALERT socket + option set * - ``ptp_event`` + - ``control`` + - Traps PTP time-critical event messages (Sync, Delay_req, Pdelay_Req and + Pdelay_Resp) * - ``ptp_general`` + - ``control`` + - Traps PTP general messages (Announce, Follow_Up, Delay_Resp, + Pdelay_Resp_Follow_Up, management and signaling) Driver-specific Packet Traps ============================ @@ -344,6 +453,40 @@ narrow. The description of these groups must be added to the following table: * - ``mc_snooping`` - Contains packet traps for IGMP and MLD packets required for multicast snooping + * - ``dhcp`` + - Contains packet traps for DHCP packets + * - ``neigh_discovery`` + - Contains packet traps for neighbour discovery packets (e.g., ARP, IPv6 + ND) + * - ``bfd`` + - Contains packet traps for BFD packets + * - ``ospf`` + - Contains packet traps for OSPF packets + * - ``bgp`` + - Contains packet traps for BGP packets + * - ``vrrp`` + - Contains packet traps for VRRP packets + * - ``pim`` + - Contains packet traps for PIM packets + * - ``uc_loopback`` + - Contains a packet trap for unicast loopback packets (i.e., + ``uc_loopback``). This trap is singled out because in cases such as + a one-armed router it will be constantly triggered.
To limit the impact on + the CPU usage, a packet trap policer with a low rate can be bound to the + group without affecting other traps + * - ``local_delivery`` + - Contains packet traps for packets that should be locally delivered after + routing, but do not match more specific packet traps (e.g., + ``ipv4_bgp``) + * - ``ipv6`` + - Contains packet traps for various IPv6 control packets (e.g., Router + Advertisements) + * - ``ptp_event`` + - Contains packet traps for PTP time-critical event messages (Sync, + Delay_req, Pdelay_Req and Pdelay_Resp) + * - ``ptp_general`` + - Contains packet traps for PTP general messages (Announce, Follow_Up, + Delay_Resp, Pdelay_Resp_Follow_Up, management and signaling) Packet Trap Policers ==================== diff --git a/include/net/devlink.h b/include/net/devlink.h index c0061542ad65..05a45dea976b 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -657,6 +657,36 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_MLD_V1_REPORT, DEVLINK_TRAP_GENERIC_ID_MLD_V2_REPORT, DEVLINK_TRAP_GENERIC_ID_MLD_V1_DONE, + DEVLINK_TRAP_GENERIC_ID_IPV4_DHCP, + DEVLINK_TRAP_GENERIC_ID_IPV6_DHCP, + DEVLINK_TRAP_GENERIC_ID_ARP_REQUEST, + DEVLINK_TRAP_GENERIC_ID_ARP_RESPONSE, + DEVLINK_TRAP_GENERIC_ID_ARP_OVERLAY, + DEVLINK_TRAP_GENERIC_ID_IPV6_NEIGH_SOLICIT, + DEVLINK_TRAP_GENERIC_ID_IPV6_NEIGH_ADVERT, + DEVLINK_TRAP_GENERIC_ID_IPV4_BFD, + DEVLINK_TRAP_GENERIC_ID_IPV6_BFD, + DEVLINK_TRAP_GENERIC_ID_IPV4_OSPF, + DEVLINK_TRAP_GENERIC_ID_IPV6_OSPF, + DEVLINK_TRAP_GENERIC_ID_IPV4_BGP, + DEVLINK_TRAP_GENERIC_ID_IPV6_BGP, + DEVLINK_TRAP_GENERIC_ID_IPV4_VRRP, + DEVLINK_TRAP_GENERIC_ID_IPV6_VRRP, + DEVLINK_TRAP_GENERIC_ID_IPV4_PIM, + DEVLINK_TRAP_GENERIC_ID_IPV6_PIM, + DEVLINK_TRAP_GENERIC_ID_UC_LB, + DEVLINK_TRAP_GENERIC_ID_LOCAL_ROUTE, + DEVLINK_TRAP_GENERIC_ID_EXTERNAL_ROUTE, + DEVLINK_TRAP_GENERIC_ID_IPV6_UC_DIP_LINK_LOCAL_SCOPE, + DEVLINK_TRAP_GENERIC_ID_IPV6_DIP_ALL_NODES, + DEVLINK_TRAP_GENERIC_ID_IPV6_DIP_ALL_ROUTERS, + DEVLINK_TRAP_GENERIC_ID_IPV6_ROUTER_SOLICIT, + DEVLINK_TRAP_GENERIC_ID_IPV6_ROUTER_ADVERT, + DEVLINK_TRAP_GENERIC_ID_IPV6_REDIRECT, + DEVLINK_TRAP_GENERIC_ID_IPV4_ROUTER_ALERT, + DEVLINK_TRAP_GENERIC_ID_IPV6_ROUTER_ALERT, + DEVLINK_TRAP_GENERIC_ID_PTP_EVENT, + DEVLINK_TRAP_GENERIC_ID_PTP_GENERAL, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -677,6 +707,18 @@ enum devlink_trap_group_generic_id { DEVLINK_TRAP_GROUP_GENERIC_ID_LACP, DEVLINK_TRAP_GROUP_GENERIC_ID_LLDP, DEVLINK_TRAP_GROUP_GENERIC_ID_MC_SNOOPING, + DEVLINK_TRAP_GROUP_GENERIC_ID_DHCP, + DEVLINK_TRAP_GROUP_GENERIC_ID_NEIGH_DISCOVERY, + DEVLINK_TRAP_GROUP_GENERIC_ID_BFD, + DEVLINK_TRAP_GROUP_GENERIC_ID_OSPF, + DEVLINK_TRAP_GROUP_GENERIC_ID_BGP, + DEVLINK_TRAP_GROUP_GENERIC_ID_VRRP, + DEVLINK_TRAP_GROUP_GENERIC_ID_PIM, + DEVLINK_TRAP_GROUP_GENERIC_ID_UC_LB, + DEVLINK_TRAP_GROUP_GENERIC_ID_LOCAL_DELIVERY, + DEVLINK_TRAP_GROUP_GENERIC_ID_IPV6, + DEVLINK_TRAP_GROUP_GENERIC_ID_PTP_EVENT, + DEVLINK_TRAP_GROUP_GENERIC_ID_PTP_GENERAL, /* Add new generic trap group IDs above */ __DEVLINK_TRAP_GROUP_GENERIC_ID_MAX, @@ -766,6 +808,66 @@ enum devlink_trap_group_generic_id { "mld_v2_report" #define DEVLINK_TRAP_GENERIC_NAME_MLD_V1_DONE \ "mld_v1_done" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_DHCP \ + "ipv4_dhcp" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_DHCP \ + "ipv6_dhcp" +#define DEVLINK_TRAP_GENERIC_NAME_ARP_REQUEST \ + "arp_request" +#define DEVLINK_TRAP_GENERIC_NAME_ARP_RESPONSE \ + "arp_response" +#define DEVLINK_TRAP_GENERIC_NAME_ARP_OVERLAY \ + "arp_overlay" 
+#define DEVLINK_TRAP_GENERIC_NAME_IPV6_NEIGH_SOLICIT \ + "ipv6_neigh_solicit" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_NEIGH_ADVERT \ + "ipv6_neigh_advert" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_BFD \ + "ipv4_bfd" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_BFD \ + "ipv6_bfd" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_OSPF \ + "ipv4_ospf" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_OSPF \ + "ipv6_ospf" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_BGP \ + "ipv4_bgp" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_BGP \ + "ipv6_bgp" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_VRRP \ + "ipv4_vrrp" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_VRRP \ + "ipv6_vrrp" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_PIM \ + "ipv4_pim" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_PIM \ + "ipv6_pim" +#define DEVLINK_TRAP_GENERIC_NAME_UC_LB \ + "uc_loopback" +#define DEVLINK_TRAP_GENERIC_NAME_LOCAL_ROUTE \ + "local_route" +#define DEVLINK_TRAP_GENERIC_NAME_EXTERNAL_ROUTE \ + "external_route" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_UC_DIP_LINK_LOCAL_SCOPE \ + "ipv6_uc_dip_link_local_scope" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_DIP_ALL_NODES \ + "ipv6_dip_all_nodes" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_DIP_ALL_ROUTERS \ + "ipv6_dip_all_routers" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_ROUTER_SOLICIT \ + "ipv6_router_solicit" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_ROUTER_ADVERT \ + "ipv6_router_advert" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_REDIRECT \ + "ipv6_redirect" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_ROUTER_ALERT \ + "ipv4_router_alert" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_ROUTER_ALERT \ + "ipv6_router_alert" +#define DEVLINK_TRAP_GENERIC_NAME_PTP_EVENT \ + "ptp_event" +#define DEVLINK_TRAP_GENERIC_NAME_PTP_GENERAL \ + "ptp_general" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" @@ -787,6 +889,30 @@ enum devlink_trap_group_generic_id { "lldp" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_MC_SNOOPING \ "mc_snooping" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_DHCP \ + "dhcp" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_NEIGH_DISCOVERY \ + "neigh_discovery" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_BFD \ + "bfd" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_OSPF \ + "ospf" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_BGP \ + "bgp" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_VRRP \ + "vrrp" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_PIM \ + "pim" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_UC_LB \ + "uc_loopback" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_LOCAL_DELIVERY \ + "local_delivery" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_IPV6 \ + "ipv6" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_PTP_EVENT \ + "ptp_event" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_PTP_GENERAL \ + "ptp_general" #define DEVLINK_TRAP_GENERIC(_type, _init_action, _id, _group_id, \ _metadata_cap) \ diff --git a/net/core/devlink.c b/net/core/devlink.c index c91ef1b5f738..f32854c3d0e7 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8507,6 +8507,36 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(MLD_V1_REPORT, CONTROL), DEVLINK_TRAP(MLD_V2_REPORT, CONTROL), DEVLINK_TRAP(MLD_V1_DONE, CONTROL), + DEVLINK_TRAP(IPV4_DHCP, CONTROL), + DEVLINK_TRAP(IPV6_DHCP, CONTROL), + DEVLINK_TRAP(ARP_REQUEST, CONTROL), + DEVLINK_TRAP(ARP_RESPONSE, CONTROL), + DEVLINK_TRAP(ARP_OVERLAY, CONTROL), + DEVLINK_TRAP(IPV6_NEIGH_SOLICIT, CONTROL), + DEVLINK_TRAP(IPV6_NEIGH_ADVERT, CONTROL), + DEVLINK_TRAP(IPV4_BFD, CONTROL), + DEVLINK_TRAP(IPV6_BFD, CONTROL), + DEVLINK_TRAP(IPV4_OSPF, CONTROL), + DEVLINK_TRAP(IPV6_OSPF, CONTROL), + DEVLINK_TRAP(IPV4_BGP, CONTROL), + 
DEVLINK_TRAP(IPV6_BGP, CONTROL), + DEVLINK_TRAP(IPV4_VRRP, CONTROL), + DEVLINK_TRAP(IPV6_VRRP, CONTROL), + DEVLINK_TRAP(IPV4_PIM, CONTROL), + DEVLINK_TRAP(IPV6_PIM, CONTROL), + DEVLINK_TRAP(UC_LB, CONTROL), + DEVLINK_TRAP(LOCAL_ROUTE, CONTROL), + DEVLINK_TRAP(EXTERNAL_ROUTE, CONTROL), + DEVLINK_TRAP(IPV6_UC_DIP_LINK_LOCAL_SCOPE, CONTROL), + DEVLINK_TRAP(IPV6_DIP_ALL_NODES, CONTROL), + DEVLINK_TRAP(IPV6_DIP_ALL_ROUTERS, CONTROL), + DEVLINK_TRAP(IPV6_ROUTER_SOLICIT, CONTROL), + DEVLINK_TRAP(IPV6_ROUTER_ADVERT, CONTROL), + DEVLINK_TRAP(IPV6_REDIRECT, CONTROL), + DEVLINK_TRAP(IPV4_ROUTER_ALERT, CONTROL), + DEVLINK_TRAP(IPV6_ROUTER_ALERT, CONTROL), + DEVLINK_TRAP(PTP_EVENT, CONTROL), + DEVLINK_TRAP(PTP_GENERAL, CONTROL), }; #define DEVLINK_TRAP_GROUP(_id) \ @@ -8526,6 +8556,18 @@ static const struct devlink_trap_group devlink_trap_group_generic[] = { DEVLINK_TRAP_GROUP(LACP), DEVLINK_TRAP_GROUP(LLDP), DEVLINK_TRAP_GROUP(MC_SNOOPING), + DEVLINK_TRAP_GROUP(DHCP), + DEVLINK_TRAP_GROUP(NEIGH_DISCOVERY), + DEVLINK_TRAP_GROUP(BFD), + DEVLINK_TRAP_GROUP(OSPF), + DEVLINK_TRAP_GROUP(BGP), + DEVLINK_TRAP_GROUP(VRRP), + DEVLINK_TRAP_GROUP(PIM), + DEVLINK_TRAP_GROUP(UC_LB), + DEVLINK_TRAP_GROUP(LOCAL_DELIVERY), + DEVLINK_TRAP_GROUP(IPV6), + DEVLINK_TRAP_GROUP(PTP_EVENT), + DEVLINK_TRAP_GROUP(PTP_GENERAL), }; static int devlink_trap_generic_verify(const struct devlink_trap *trap) -- cgit v1.2.3 From 5eb18a2b6c11bf165271644ef1ab812b10659c8f Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:43 +0300 Subject: devlink: Add ACL control packet traps Add packet traps for packets that are sampled / trapped by ACLs, so that capable drivers could register them with devlink. Add documentation for every added packet trap and packet trap group. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 14 ++++++++++++++ include/net/devlink.h | 12 ++++++++++++ net/core/devlink.c | 4 ++++ 3 files changed, 30 insertions(+) (limited to 'net') diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index 621b634b16be..1e3f3ffee248 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -397,6 +397,14 @@ be added to the following table: - ``control`` - Traps PTP general messages (Announce, Follow_Up, Delay_Resp, Pdelay_Resp_Follow_Up, management and signaling) + * - ``flow_action_sample`` + - ``control`` + - Traps packets sampled during processing of flow action sample (e.g., via + tc's sample action) + * - ``flow_action_trap`` + - ``control`` + - Traps packets logged during processing of flow action trap (e.g., via + tc's trap action) Driver-specific Packet Traps ============================ @@ -487,6 +495,12 @@ narrow. 
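On the reporting path, a driver that receives such a sampled or trapped packet from the hardware would hand it back to devlink roughly as follows (the foo_* wrapper is hypothetical; devlink_trap_report() with the flow action cookie argument is the helper used by net/core/devlink.c in this series):

/* trap_ctx is the per-trap context devlink handed to the driver when the
 * flow_action_sample / flow_action_trap trap was set up; fa_cookie, which
 * may be NULL, carries the user cookie of the ACL rule that matched. */
static void foo_report_acl_pkt(struct devlink *devlink, struct sk_buff *skb,
			       void *trap_ctx, struct devlink_port *in_port,
			       const struct flow_action_cookie *fa_cookie)
{
	devlink_trap_report(devlink, skb, trap_ctx, in_port, fa_cookie);
}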
The description of these groups must be added to the following table: * - ``ptp_general`` - Contains packet traps for PTP general messages (Announce, Follow_Up, Delay_Resp, Pdelay_Resp_Follow_Up, management and signaling) + * - ``acl_sample`` + - Contains packet traps for packets that were sampled by the device during + ACL processing + * - ``acl_trap`` + - Contains packet traps for packets that were trapped (logged) by the + device during ACL processing Packet Trap Policers ==================== diff --git a/include/net/devlink.h b/include/net/devlink.h index 05a45dea976b..1df6dfec26c2 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -687,6 +687,8 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_IPV6_ROUTER_ALERT, DEVLINK_TRAP_GENERIC_ID_PTP_EVENT, DEVLINK_TRAP_GENERIC_ID_PTP_GENERAL, + DEVLINK_TRAP_GENERIC_ID_FLOW_ACTION_SAMPLE, + DEVLINK_TRAP_GENERIC_ID_FLOW_ACTION_TRAP, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -719,6 +721,8 @@ enum devlink_trap_group_generic_id { DEVLINK_TRAP_GROUP_GENERIC_ID_IPV6, DEVLINK_TRAP_GROUP_GENERIC_ID_PTP_EVENT, DEVLINK_TRAP_GROUP_GENERIC_ID_PTP_GENERAL, + DEVLINK_TRAP_GROUP_GENERIC_ID_ACL_SAMPLE, + DEVLINK_TRAP_GROUP_GENERIC_ID_ACL_TRAP, /* Add new generic trap group IDs above */ __DEVLINK_TRAP_GROUP_GENERIC_ID_MAX, @@ -868,6 +872,10 @@ enum devlink_trap_group_generic_id { "ptp_event" #define DEVLINK_TRAP_GENERIC_NAME_PTP_GENERAL \ "ptp_general" +#define DEVLINK_TRAP_GENERIC_NAME_FLOW_ACTION_SAMPLE \ + "flow_action_sample" +#define DEVLINK_TRAP_GENERIC_NAME_FLOW_ACTION_TRAP \ + "flow_action_trap" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" @@ -913,6 +921,10 @@ enum devlink_trap_group_generic_id { "ptp_event" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_PTP_GENERAL \ "ptp_general" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_ACL_SAMPLE \ + "acl_sample" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_ACL_TRAP \ + "acl_trap" #define DEVLINK_TRAP_GENERIC(_type, _init_action, _id, _group_id, \ _metadata_cap) \ diff --git a/net/core/devlink.c b/net/core/devlink.c index f32854c3d0e7..2cafbc808b09 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8537,6 +8537,8 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(IPV6_ROUTER_ALERT, CONTROL), DEVLINK_TRAP(PTP_EVENT, CONTROL), DEVLINK_TRAP(PTP_GENERAL, CONTROL), + DEVLINK_TRAP(FLOW_ACTION_SAMPLE, CONTROL), + DEVLINK_TRAP(FLOW_ACTION_TRAP, CONTROL), }; #define DEVLINK_TRAP_GROUP(_id) \ @@ -8568,6 +8570,8 @@ static const struct devlink_trap_group devlink_trap_group_generic[] = { DEVLINK_TRAP_GROUP(IPV6), DEVLINK_TRAP_GROUP(PTP_EVENT), DEVLINK_TRAP_GROUP(PTP_GENERAL), + DEVLINK_TRAP_GROUP(ACL_SAMPLE), + DEVLINK_TRAP_GROUP(ACL_TRAP), }; static int devlink_trap_generic_verify(const struct devlink_trap *trap) -- cgit v1.2.3 From 0af413bd3e2de73bcf0742ed556be4af83c71964 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 29 May 2020 22:13:58 +0200 Subject: flow_dissector: work around stack frame size warning The fl_flow_key structure is around 500 bytes, so having two of them on the stack in one function now exceeds the warning limit after an otherwise correct change: net/sched/cls_flower.c:298:12: error: stack frame size of 1056 bytes in function 'fl_classify' [-Werror,-Wframe-larger-than=] I suspect the fl_classify function could be reworked to only have one of them on the stack and modify it in place, but I could not work out how to do that. 
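To make the effect concrete, a minimal sketch with made-up types (not part of the patch): once the second large local lives in a noinline_for_stack helper, it occupies the helper's frame only for the duration of that call instead of sitting in the outer frame underneath everything else the outer function calls.

struct big_key { char buf[512]; };

static noinline_for_stack int lookup(const struct big_key *key)
{
	struct big_key masked = *key;	/* second copy, confined to this frame */

	return masked.buf[0];
}

static int classify(void)
{
	struct big_key key = {};	/* the only copy in classify()'s frame */

	return lookup(&key);		/* masked is live only during this call */
}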
As a somewhat hacky workaround, move one of them into an out-of-line function to reduce its scope. This does not necessarily reduce the stack usage of the outer function, but at least the second copy is removed from the stack during most of it and does not add up to whatever is called from there. I now see 552 bytes of stack usage for fl_classify(), plus 528 bytes for fl_mask_lookup(). Fixes: 58cff782cc55 ("flow_dissector: Parse multiple MPLS Label Stack Entries") Signed-off-by: Arnd Bergmann Acked-by: Cong Wang Acked-by: Guillaume Nault Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 96f5999281e0..030896eadd11 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -272,14 +272,16 @@ static struct cls_fl_filter *fl_lookup_range(struct fl_flow_mask *mask, return NULL; } -static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask, - struct fl_flow_key *mkey, - struct fl_flow_key *key) +static noinline_for_stack +struct cls_fl_filter *fl_mask_lookup(struct fl_flow_mask *mask, struct fl_flow_key *key) { + struct fl_flow_key mkey; + + fl_set_masked_key(&mkey, key, mask); if ((mask->flags & TCA_FLOWER_MASK_FLAGS_RANGE)) - return fl_lookup_range(mask, mkey, key); + return fl_lookup_range(mask, &mkey, key); - return __fl_lookup(mask, mkey); + return __fl_lookup(mask, &mkey); } static u16 fl_ct_info_to_flower_map[] = { @@ -299,7 +301,6 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct cls_fl_head *head = rcu_dereference_bh(tp->root); - struct fl_flow_key skb_mkey; struct fl_flow_key skb_key; struct fl_flow_mask *mask; struct cls_fl_filter *f; @@ -319,9 +320,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, ARRAY_SIZE(fl_ct_info_to_flower_map)); skb_flow_dissect(skb, &mask->dissector, &skb_key, 0); - fl_set_masked_key(&skb_mkey, &skb_key, mask); - - f = fl_lookup(mask, &skb_mkey, &skb_key); + f = fl_mask_lookup(mask, &skb_key); if (f && !tc_skip_sw(f->flags)) { *res = f->res; return tcf_exts_exec(skb, &f->exts, res); -- cgit v1.2.3 From 4b3a61b030d1131dcf3633a276158a3d0a435a47 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sat, 30 May 2020 18:09:47 +0000 Subject: bridge: mrp: Set the priority of MRP instance Each MRP instance has a priority; a lower value means a higher priority. The priority of an MRP instance is stored in the MRP_Test frames; in this way, all the MRP nodes in the ring can see the other nodes' priorities. Signed-off-by: Horatiu Vultur Signed-off-by: David S.
Miller --- include/net/switchdev.h | 1 + include/uapi/linux/if_bridge.h | 2 ++ net/bridge/br_mrp.c | 3 ++- net/bridge/br_mrp_netlink.c | 5 +++++ net/bridge/br_mrp_switchdev.c | 1 + net/bridge/br_private_mrp.h | 1 + 6 files changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index db519957e134..f82ef4c45f5e 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -116,6 +116,7 @@ struct switchdev_obj_mrp { struct net_device *p_port; struct net_device *s_port; u32 ring_id; + u16 prio; }; #define SWITCHDEV_OBJ_MRP(OBJ) \ diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 5a43eb86c93b..0162c1370ecb 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -176,6 +176,7 @@ enum { IFLA_BRIDGE_MRP_INSTANCE_RING_ID, IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX, IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX, + IFLA_BRIDGE_MRP_INSTANCE_PRIO, __IFLA_BRIDGE_MRP_INSTANCE_MAX, }; @@ -230,6 +231,7 @@ struct br_mrp_instance { __u32 ring_id; __u32 p_ifindex; __u32 s_ifindex; + __u16 prio; }; struct br_mrp_ring_state { diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index 8ea59504ef47..f8fd037219fe 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -147,7 +147,7 @@ static struct sk_buff *br_mrp_alloc_test_skb(struct br_mrp *mrp, br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_RING_TEST, sizeof(*hdr)); hdr = skb_put(skb, sizeof(*hdr)); - hdr->prio = cpu_to_be16(MRP_DEFAULT_PRIO); + hdr->prio = cpu_to_be16(mrp->prio); ether_addr_copy(hdr->sa, p->br->dev->dev_addr); hdr->port_role = cpu_to_be16(port_role); hdr->state = cpu_to_be16(mrp->ring_state); @@ -290,6 +290,7 @@ int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance) return -ENOMEM; mrp->ring_id = instance->ring_id; + mrp->prio = instance->prio; p = br_mrp_get_port(br, instance->p_ifindex); spin_lock_bh(&br->lock); diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c index d9de780d2ce0..8cb67d9ca44e 100644 --- a/net/bridge/br_mrp_netlink.c +++ b/net/bridge/br_mrp_netlink.c @@ -22,6 +22,7 @@ br_mrp_instance_policy[IFLA_BRIDGE_MRP_INSTANCE_MAX + 1] = { [IFLA_BRIDGE_MRP_INSTANCE_RING_ID] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_INSTANCE_PRIO] = { .type = NLA_U16 }, }; static int br_mrp_instance_parse(struct net_bridge *br, struct nlattr *attr, @@ -49,6 +50,10 @@ static int br_mrp_instance_parse(struct net_bridge *br, struct nlattr *attr, inst.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_RING_ID]); inst.p_ifindex = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX]); inst.s_ifindex = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX]); + inst.prio = MRP_DEFAULT_PRIO; + + if (tb[IFLA_BRIDGE_MRP_INSTANCE_PRIO]) + inst.prio = nla_get_u16(tb[IFLA_BRIDGE_MRP_INSTANCE_PRIO]); if (cmd == RTM_SETLINK) return br_mrp_add(br, &inst); diff --git a/net/bridge/br_mrp_switchdev.c b/net/bridge/br_mrp_switchdev.c index 51cb1d5a24b4..3a776043bf80 100644 --- a/net/bridge/br_mrp_switchdev.c +++ b/net/bridge/br_mrp_switchdev.c @@ -12,6 +12,7 @@ int br_mrp_switchdev_add(struct net_bridge *br, struct br_mrp *mrp) .p_port = rtnl_dereference(mrp->p_port)->dev, .s_port = rtnl_dereference(mrp->s_port)->dev, .ring_id = mrp->ring_id, + .prio = mrp->prio, }; int err; diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h index a0f53cc3ab85..558941ce2366 100644 --- 
a/net/bridge/br_private_mrp.h +++ b/net/bridge/br_private_mrp.h @@ -14,6 +14,7 @@ struct br_mrp { struct net_bridge_port __rcu *s_port; u32 ring_id; + u16 prio; enum br_mrp_ring_role_type ring_role; u8 ring_role_offloaded; -- cgit v1.2.3 From c6676e7d62cfb5cb7c1c5320a26f3634a11afdb0 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sat, 30 May 2020 18:09:48 +0000 Subject: bridge: mrp: Add support for role MRA A node that has the MRA role can behave as either an MRM or an MRC. Initially it starts as an MRM and sends MRP_Test frames on both ring ports. If it detects MRP_Test frames sent by another MRM, it checks whether these frames have a lower priority than its own. In that case it sends MRP_Nack frames to notify the other node that it needs to stop sending MRP_Test frames. If it receives an MRP_Nack frame, it stops sending MRP_Test frames and starts to behave as an MRC, but it continues to monitor the MRP_Test frames sent by the MRM. If at some point the MRM stops sending MRP_Test frames, the node takes over the MRM role and starts to send MRP_Test frames itself. Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/net/switchdev.h | 1 + include/uapi/linux/if_bridge.h | 2 + include/uapi/linux/mrp_bridge.h | 38 ++++++++++++ net/bridge/br_mrp.c | 125 ++++++++++++++++++++++++++++++++++------ net/bridge/br_mrp_netlink.c | 6 ++ net/bridge/br_mrp_switchdev.c | 4 +- net/bridge/br_private_mrp.h | 4 +- 7 files changed, 159 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index f82ef4c45f5e..b8c059b4e06d 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -130,6 +130,7 @@ struct switchdev_obj_ring_test_mrp { u8 max_miss; u32 ring_id; u32 period; + bool monitor; }; #define SWITCHDEV_OBJ_RING_TEST_MRP(OBJ) \ diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 0162c1370ecb..caa6914a3e53 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -222,6 +222,7 @@ enum { IFLA_BRIDGE_MRP_START_TEST_INTERVAL, IFLA_BRIDGE_MRP_START_TEST_MAX_MISS, IFLA_BRIDGE_MRP_START_TEST_PERIOD, + IFLA_BRIDGE_MRP_START_TEST_MONITOR, __IFLA_BRIDGE_MRP_START_TEST_MAX, }; @@ -249,6 +250,7 @@ struct br_mrp_start_test { __u32 interval; __u32 max_miss; __u32 period; + __u32 monitor; }; struct bridge_stp_xstats { diff --git a/include/uapi/linux/mrp_bridge.h b/include/uapi/linux/mrp_bridge.h index bcad42128d62..84f15f48a7cb 100644 --- a/include/uapi/linux/mrp_bridge.h +++ b/include/uapi/linux/mrp_bridge.h @@ -11,11 +11,14 @@ #define MRP_DOMAIN_UUID_LENGTH 16 #define MRP_VERSION 1 #define MRP_FRAME_PRIO 7 +#define MRP_OUI_LENGTH 3 +#define MRP_MANUFACTURE_DATA_LENGTH 2 enum br_mrp_ring_role_type { BR_MRP_RING_ROLE_DISABLED, BR_MRP_RING_ROLE_MRC, BR_MRP_RING_ROLE_MRM, + BR_MRP_RING_ROLE_MRA, }; enum br_mrp_ring_state_type { @@ -43,6 +46,13 @@ enum br_mrp_tlv_header_type { BR_MRP_TLV_HEADER_RING_TOPO = 0x3, BR_MRP_TLV_HEADER_RING_LINK_DOWN = 0x4, BR_MRP_TLV_HEADER_RING_LINK_UP = 0x5, + BR_MRP_TLV_HEADER_OPTION = 0x7f, +}; + +enum br_mrp_sub_tlv_header_type { + BR_MRP_SUB_TLV_HEADER_TEST_MGR_NACK = 0x1, + BR_MRP_SUB_TLV_HEADER_TEST_PROPAGATE = 0x2, + BR_MRP_SUB_TLV_HEADER_TEST_AUTO_MGR = 0x3, }; struct br_mrp_tlv_hdr { @@ -50,6 +60,11 @@ struct br_mrp_tlv_hdr { __u8 type; __u8 length; }; +struct br_mrp_sub_tlv_hdr { + __u8 type; + __u8 length; +}; + struct br_mrp_end_hdr { struct br_mrp_tlv_hdr hdr; }; @@ -81,4 +96,27 @@ struct br_mrp_ring_link_hdr { __be16 blocked; }; +struct br_mrp_sub_opt_hdr { + __u8
type; + __u8 manufacture_data[MRP_MANUFACTURE_DATA_LENGTH]; +}; + +struct br_mrp_test_mgr_nack_hdr { + __be16 prio; + __u8 sa[ETH_ALEN]; + __be16 other_prio; + __u8 other_sa[ETH_ALEN]; +}; + +struct br_mrp_test_prop_hdr { + __be16 prio; + __u8 sa[ETH_ALEN]; + __be16 other_prio; + __u8 other_sa[ETH_ALEN]; +}; + +struct br_mrp_oui_hdr { + __u8 oui[MRP_OUI_LENGTH]; +}; + #endif diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index f8fd037219fe..24986ec7d38c 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -160,6 +160,16 @@ static struct sk_buff *br_mrp_alloc_test_skb(struct br_mrp *mrp, return skb; } +/* This function is continuously called in the following cases: + * - when node role is MRM, in this case test_monitor is always set to false + * because it needs to notify the userspace that the ring is open and needs to + * send MRP_Test frames + * - when node role is MRA, there are 2 subcases: + * - when MRA behaves as MRM, in this case it is similar to the MRM role + * - when MRA behaves as MRC, in this case test_monitor is set to true, + * because it needs to detect when it stops seeing MRP_Test frames + * from MRM node but it doesn't need to send MRP_Test frames. + */ static void br_mrp_test_work_expired(struct work_struct *work) { struct delayed_work *del_work = to_delayed_work(work); @@ -177,8 +187,14 @@ static void br_mrp_test_work_expired(struct work_struct *work) /* Notify that the ring is open only if the ring state is * closed, otherwise it would continue to notify at every * interval. + * Also notify that the ring is open when the node has the + * role MRA and behaves as MRC. The reason is that the + * userspace needs to know when the MRM stopped sending + * MRP_Test frames so that the current node can try to take + * the role of the MRM. */ - if (mrp->ring_state == BR_MRP_RING_STATE_CLOSED) + if (mrp->ring_state == BR_MRP_RING_STATE_CLOSED || + mrp->test_monitor) notify_open = true; } @@ -186,12 +202,15 @@ static void br_mrp_test_work_expired(struct work_struct *work) p = rcu_dereference(mrp->p_port); if (p) { - skb = br_mrp_alloc_test_skb(mrp, p, BR_MRP_PORT_ROLE_PRIMARY); - if (!skb) - goto out; - - skb_reset_network_header(skb); - dev_queue_xmit(skb); + if (!mrp->test_monitor) { + skb = br_mrp_alloc_test_skb(mrp, p, + BR_MRP_PORT_ROLE_PRIMARY); + if (!skb) + goto out; + + skb_reset_network_header(skb); + dev_queue_xmit(skb); + } if (notify_open && !mrp->ring_role_offloaded) br_mrp_port_open(p->dev, true); @@ -199,12 +218,15 @@ static void br_mrp_test_work_expired(struct work_struct *work) p = rcu_dereference(mrp->s_port); if (p) { - skb = br_mrp_alloc_test_skb(mrp, p, BR_MRP_PORT_ROLE_SECONDARY); - if (!skb) - goto out; - - skb_reset_network_header(skb); - dev_queue_xmit(skb); + if (!mrp->test_monitor) { + skb = br_mrp_alloc_test_skb(mrp, p, + BR_MRP_PORT_ROLE_SECONDARY); + if (!skb) + goto out; + + skb_reset_network_header(skb); + dev_queue_xmit(skb); + } if (notify_open && !mrp->ring_role_offloaded) br_mrp_port_open(p->dev, true); @@ -227,7 +249,7 @@ static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp) /* Stop sending MRP_Test frames */ cancel_delayed_work_sync(&mrp->test_work); - br_mrp_switchdev_send_ring_test(br, mrp, 0, 0, 0); + br_mrp_switchdev_send_ring_test(br, mrp, 0, 0, 0, 0); br_mrp_switchdev_del(br, mrp); @@ -452,8 +474,8 @@ int br_mrp_set_ring_role(struct net_bridge *br, return 0; } -/* Start to generate MRP test frames, the frames are generated by HW and if it - * fails, they are generated by the SW.
+/* Start to generate or monitor MRP test frames, the frames are generated by + * HW and if it fails, they are generated by the SW. * note: already called with rtnl_lock */ int br_mrp_start_test(struct net_bridge *br, @@ -464,16 +486,18 @@ int br_mrp_start_test(struct net_bridge *br, if (!mrp) return -EINVAL; - /* Try to push it to the HW and if it fails then continue to generate in - * SW and if that also fails then return error + /* Try to push it to the HW and if it fails then continue with SW + * implementation and if that also fails then return error. */ if (!br_mrp_switchdev_send_ring_test(br, mrp, test->interval, - test->max_miss, test->period)) + test->max_miss, test->period, + test->monitor)) return 0; mrp->test_interval = test->interval; mrp->test_end = jiffies + usecs_to_jiffies(test->period); mrp->test_max_miss = test->max_miss; + mrp->test_monitor = test->monitor; mrp->test_count_miss = 0; queue_delayed_work(system_wq, &mrp->test_work, usecs_to_jiffies(test->interval)); @@ -510,6 +534,57 @@ static void br_mrp_mrm_process(struct br_mrp *mrp, struct net_bridge_port *port, br_mrp_port_open(port->dev, false); } +/* Determine if the test hdr has a better priority than the node */ +static bool br_mrp_test_better_than_own(struct br_mrp *mrp, + struct net_bridge *br, + const struct br_mrp_ring_test_hdr *hdr) +{ + u16 prio = be16_to_cpu(hdr->prio); + + if (prio < mrp->prio || + (prio == mrp->prio && + ether_addr_to_u64(hdr->sa) < ether_addr_to_u64(br->dev->dev_addr))) + return true; + + return false; +} + +/* Process only MRP Test frames. All the other MRP frames are processed by + * the userspace application + * note: already called with rcu_read_lock + */ +static void br_mrp_mra_process(struct br_mrp *mrp, struct net_bridge *br, + struct net_bridge_port *port, + struct sk_buff *skb) +{ + const struct br_mrp_ring_test_hdr *test_hdr; + struct br_mrp_ring_test_hdr _test_hdr; + const struct br_mrp_tlv_hdr *hdr; + struct br_mrp_tlv_hdr _hdr; + + /* Each MRP header starts with a version field which is 16 bits. + * Therefore skip the version and get directly the TLV header. + */ + hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr); + if (!hdr) + return; + + if (hdr->type != BR_MRP_TLV_HEADER_RING_TEST) + return; + + test_hdr = skb_header_pointer(skb, sizeof(uint16_t) + sizeof(_hdr), + sizeof(_test_hdr), &_test_hdr); + if (!test_hdr) + return; + + /* Only frames that have a better priority than the node will + * clear the miss counter because otherwise the node will need to behave + * as MRM. + */ + if (br_mrp_test_better_than_own(mrp, br, test_hdr)) + mrp->test_count_miss = 0; +} + /* This will just forward the frame to the other mrp ring port (MRC role) or will * not do anything.
* note: already called with rcu_read_lock @@ -546,6 +621,18 @@ static int br_mrp_rcv(struct net_bridge_port *p, return 1; } + /* If the role is MRA then don't forward the frames if it behaves as + * MRM node + */ + if (mrp->ring_role == BR_MRP_RING_ROLE_MRA) { + if (!mrp->test_monitor) { + br_mrp_mrm_process(mrp, p, skb); + return 1; + } + + br_mrp_mra_process(mrp, br, p, skb); + } + /* Clone the frame and forward it on the other MRP port */ nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c index 8cb67d9ca44e..34b3a8776991 100644 --- a/net/bridge/br_mrp_netlink.c +++ b/net/bridge/br_mrp_netlink.c @@ -196,6 +196,7 @@ br_mrp_start_test_policy[IFLA_BRIDGE_MRP_START_TEST_MAX + 1] = { [IFLA_BRIDGE_MRP_START_TEST_INTERVAL] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_START_TEST_MAX_MISS] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_START_TEST_PERIOD] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_START_TEST_MONITOR] = { .type = NLA_U32 }, }; static int br_mrp_start_test_parse(struct net_bridge *br, struct nlattr *attr, @@ -225,6 +226,11 @@ static int br_mrp_start_test_parse(struct net_bridge *br, struct nlattr *attr, test.interval = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_INTERVAL]); test.max_miss = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_MAX_MISS]); test.period = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_PERIOD]); + test.monitor = false; + + if (tb[IFLA_BRIDGE_MRP_START_TEST_MONITOR]) + test.monitor = + nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_MONITOR]); return br_mrp_start_test(br, &test); } diff --git a/net/bridge/br_mrp_switchdev.c b/net/bridge/br_mrp_switchdev.c index 3a776043bf80..0da68a0da4b5 100644 --- a/net/bridge/br_mrp_switchdev.c +++ b/net/bridge/br_mrp_switchdev.c @@ -65,7 +65,8 @@ int br_mrp_switchdev_set_ring_role(struct net_bridge *br, int br_mrp_switchdev_send_ring_test(struct net_bridge *br, struct br_mrp *mrp, u32 interval, - u8 max_miss, u32 period) + u8 max_miss, u32 period, + bool monitor) { struct switchdev_obj_ring_test_mrp test = { .obj.orig_dev = br->dev, @@ -74,6 +75,7 @@ int br_mrp_switchdev_send_ring_test(struct net_bridge *br, .max_miss = max_miss, .ring_id = mrp->ring_id, .period = period, + .monitor = monitor, }; int err; diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h index 558941ce2366..33b255e38ffe 100644 --- a/net/bridge/br_private_mrp.h +++ b/net/bridge/br_private_mrp.h @@ -26,6 +26,7 @@ struct br_mrp { unsigned long test_end; u32 test_count_miss; u32 test_max_miss; + bool test_monitor; u32 seq_id; @@ -52,7 +53,8 @@ int br_mrp_switchdev_set_ring_role(struct net_bridge *br, struct br_mrp *mrp, int br_mrp_switchdev_set_ring_state(struct net_bridge *br, struct br_mrp *mrp, enum br_mrp_ring_state_type state); int br_mrp_switchdev_send_ring_test(struct net_bridge *br, struct br_mrp *mrp, - u32 interval, u8 max_miss, u32 period); + u32 interval, u8 max_miss, u32 period, + bool monitor); int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, enum br_mrp_port_state_type state); int br_mrp_port_switchdev_set_role(struct net_bridge_port *p, -- cgit v1.2.3 From 4e4f4ce6abf5f6a8df0561776d3a790d60d519d0 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Sat, 30 May 2020 20:49:56 +0200 Subject: cls_flower: remove mpls_opts_policy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiling with W=1 gives the following warning: net/sched/cls_flower.c:731:1: warning: ‘mpls_opts_policy’ defined but not used [-Wunused-const-variable=] The 
TCA_FLOWER_KEY_MPLS_OPTS contains a list of TCA_FLOWER_KEY_MPLS_OPTS_LSE. Therefore, the attributes all have the same type and we can't parse the list with nla_parse*() and have the attributes validated automatically using an nla_policy. fl_set_key_mpls_opts() properly verifies that all attributes in the list are TCA_FLOWER_KEY_MPLS_OPTS_LSE. Then fl_set_key_mpls_lse() uses nla_parse_nested() on all these attributes, thus verifying that they have the NLA_F_NESTED flag. So we can safely drop the mpls_opts_policy. Reported-by: kbuild test robot Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 030896eadd11..b2da37286082 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -726,11 +726,6 @@ erspan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1] = { [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID] = { .type = NLA_U8 }, }; -static const struct nla_policy -mpls_opts_policy[TCA_FLOWER_KEY_MPLS_OPTS_MAX + 1] = { - [TCA_FLOWER_KEY_MPLS_OPTS_LSE] = { .type = NLA_NESTED }, -}; - static const struct nla_policy mpls_stack_entry_policy[TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX + 1] = { [TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH] = { .type = NLA_U8 }, -- cgit v1.2.3 From 547ce4cfb34cdecfa0ee19c29a5510329a7ac802 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 31 May 2020 02:06:55 +0100 Subject: switch cmsghdr_from_user_compat_to_kern() to copy_from_user() no point getting compat_cmsghdr field-by-field Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/compat.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/compat.c b/net/compat.c index afd7b444e0bf..5e3041a2c37d 100644 --- a/net/compat.c +++ b/net/compat.c @@ -183,20 +183,21 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk, memset(kcmsg, 0, kcmlen); ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg); while (ucmsg != NULL) { - if (__get_user(ucmlen, &ucmsg->cmsg_len)) + struct compat_cmsghdr cmsg; + if (copy_from_user(&cmsg, ucmsg, sizeof(cmsg))) goto Efault; - if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg)) + if (!CMSG_COMPAT_OK(cmsg.cmsg_len, ucmsg, kmsg)) goto Einval; - tmp = ((ucmlen - sizeof(*ucmsg)) + sizeof(struct cmsghdr)); + tmp = ((cmsg.cmsg_len - sizeof(*ucmsg)) + sizeof(struct cmsghdr)); if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp)) goto Einval; kcmsg->cmsg_len = tmp; + kcmsg->cmsg_level = cmsg.cmsg_level; + kcmsg->cmsg_type = cmsg.cmsg_type; tmp = CMSG_ALIGN(tmp); - if (__get_user(kcmsg->cmsg_level, &ucmsg->cmsg_level) || - __get_user(kcmsg->cmsg_type, &ucmsg->cmsg_type) || - copy_from_user(CMSG_DATA(kcmsg), + if (copy_from_user(CMSG_DATA(kcmsg), CMSG_COMPAT_DATA(ucmsg), - (ucmlen - sizeof(*ucmsg)))) + (cmsg.cmsg_len - sizeof(*ucmsg)))) goto Efault; /* Advance. */ -- cgit v1.2.3 From abe3cac8706bffeda3ebc06e4a9fa6e9cadacf26 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 24 May 2020 09:50:33 -0700 Subject: bpf, sk_msg: Add some generic helpers that may be useful from sk_msg Add these generic helpers that may be useful to use from sk_msg programs. 
The helpers do not depend on ctx so we can simply add them here, BPF_FUNC_perf_event_output BPF_FUNC_get_current_uid_gid BPF_FUNC_get_current_pid_tgid BPF_FUNC_get_current_cgroup_id BPF_FUNC_get_current_ancestor_cgroup_id BPF_FUNC_get_cgroup_classid Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/159033903373.12355.15489763099696629346.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index bd2853d23b50..c3b496a19748 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6443,6 +6443,22 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_push_data_proto; case BPF_FUNC_msg_pop_data: return &bpf_msg_pop_data_proto; + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; +#ifdef CONFIG_CGROUPS + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; + case BPF_FUNC_get_current_ancestor_cgroup_id: + return &bpf_get_current_ancestor_cgroup_id_proto; +#endif +#ifdef CONFIG_CGROUP_NET_CLASSID + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_curr_proto; +#endif default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3 From 13d70f5a5ecff367db2fb18ed4ebe433eab8a74c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 24 May 2020 09:51:15 -0700 Subject: bpf, sk_msg: Add get socket storage helpers Add helpers to use local socket storage. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/159033907577.12355.14740125020572756560.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 ++ net/core/filter.c | 15 +++++++++++++++ tools/include/uapi/linux/bpf.h | 2 ++ 3 files changed, 19 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 97e1fd19ff58..54b93f8b49b8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3645,6 +3645,8 @@ struct sk_msg_md { __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ __u32 size; /* Total size of sk_msg */ + + __bpf_md_ptr(struct bpf_sock *, sk); /* current socket */ }; struct sk_reuseport_md { diff --git a/net/core/filter.c b/net/core/filter.c index c3b496a19748..a6fc23447f12 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6449,6 +6449,10 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_current_pid_tgid: return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; #ifdef CONFIG_CGROUPS case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; @@ -7273,6 +7277,11 @@ static bool sk_msg_is_valid_access(int off, int size, if (size != sizeof(__u64)) return false; break; + case offsetof(struct sk_msg_md, sk): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET; + break; case bpf_ctx_range(struct sk_msg_md, family): case bpf_ctx_range(struct sk_msg_md, remote_ip4): case bpf_ctx_range(struct 
sk_msg_md, local_ip4): @@ -8609,6 +8618,12 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct sk_msg_sg, size)); break; + + case offsetof(struct sk_msg_md, sk): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, sk)); + break; } return insn - insn_buf; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 97e1fd19ff58..54b93f8b49b8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3645,6 +3645,8 @@ struct sk_msg_md { __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ __u32 size; /* Total size of sk_msg */ + + __bpf_md_ptr(struct bpf_sock *, sk); /* current socket */ }; struct sk_reuseport_md { -- cgit v1.2.3 From c3c16f2ea6d20159903cf93afbb1155f3d8348d5 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Tue, 26 May 2020 17:34:36 -0700 Subject: bpf: Add rx_queue_mapping to bpf_sock Add "rx_queue_mapping" to bpf_sock. This gives read access for the existing field (sk_rx_queue_mapping) of struct sock from bpf_sock. Semantics for the bpf_sock rx_queue_mapping access are similar to sk_rx_queue_get(), i.e the value NO_QUEUE_MAPPING is not allowed and -1 is returned in that case. This is useful for transmit queue selection based on the received queue index which is cached in the socket in the receive path. v3: Addressed review comments to add usecase in patch description, and fixed default value for rx_queue_mapping. v2: fixed build error for CONFIG_XPS wrapping, reported by kbuild test robot Signed-off-by: Amritha Nambiar Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 974ca6e948e3..630432c5c292 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3612,6 +3612,7 @@ struct bpf_sock { __u32 dst_ip4; __u32 dst_ip6[4]; __u32 state; + __s32 rx_queue_mapping; }; struct bpf_tcp_sock { diff --git a/net/core/filter.c b/net/core/filter.c index a6fc23447f12..0008b029d644 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6849,6 +6849,7 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, case offsetof(struct bpf_sock, protocol): case offsetof(struct bpf_sock, dst_port): case offsetof(struct bpf_sock, src_port): + case offsetof(struct bpf_sock, rx_queue_mapping): case bpf_ctx_range(struct bpf_sock, src_ip4): case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): case bpf_ctx_range(struct bpf_sock, dst_ip4): @@ -7897,6 +7898,23 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, skc_state), target_size)); break; + case offsetof(struct bpf_sock, rx_queue_mapping): +#ifdef CONFIG_XPS + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock, sk_rx_queue_mapping, + sizeof_field(struct sock, + sk_rx_queue_mapping), + target_size)); + *insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, NO_QUEUE_MAPPING, + 1); + *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); +#else + *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); + *target_size = 2; +#endif + break; } return insn - insn_buf; -- cgit v1.2.3 From fbee97feed9b3e4acdf9590e1f6b4a2eefecfffe Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 29 May 2020 16:07:13 -0600 Subject: bpf: Add support to attach bpf program 
to a devmap entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add BPF_XDP_DEVMAP attach type for use with programs associated with a DEVMAP entry. Allow DEVMAPs to associate a program with a device entry by adding a bpf_prog.fd to 'struct bpf_devmap_val'. Values read show the program id, so the fd and id are a union. bpf programs can get access to the struct via vmlinux.h. The program associated with the fd must have type XDP with expected attach type BPF_XDP_DEVMAP. When a program is associated with a device index, the program is run on an XDP_REDIRECT and before the buffer is added to the per-cpu queue. At this point rxq data is still valid; the next patch adds tx device information allowing the program to see both ingress and egress device indices. XDP generic is skb based and XDP programs do not work with skb's. Block the use case by walking the maps used by a program that is to be attached via xdpgeneric and fail if any of them are DEVMAP / DEVMAP_HASH maps that can have a program attached to their entries. Also block attaching BPF_XDP_DEVMAP programs directly to a device. Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200529220716.75383-3-dsahern@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 +++ include/uapi/linux/bpf.h | 1 + kernel/bpf/devmap.c | 88 ++++++++++++++++++++++++++++++++++++++++-- net/core/dev.c | 18 +++++++++ tools/include/uapi/linux/bpf.h | 1 + 5 files changed, 109 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e5884f7f801c..e042311f991f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1250,6 +1250,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, struct net_device *dev_rx); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog); +bool dev_map_can_have_prog(struct bpf_map *map); struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); void __cpu_map_flush(void); @@ -1363,6 +1364,10 @@ static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map { return NULL; } +static inline bool dev_map_can_have_prog(struct bpf_map *map) +{ + return false; +} static inline void __dev_flush(void) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 630432c5c292..f1e364d69007 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -225,6 +225,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_GETPEERNAME, BPF_CGROUP_INET4_GETSOCKNAME, BPF_CGROUP_INET6_GETSOCKNAME, + BPF_XDP_DEVMAP, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a1459de0914e..0089d56617ec 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -63,12 +63,17 @@ struct xdp_dev_bulk_queue { /* DEVMAP values */ struct bpf_devmap_val { u32 ifindex; /* device index */ + union { + int fd; /* prog fd on map write */ + u32 id; /* prog id on map read */ + } bpf_prog; }; struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; struct bpf_dtab *dtab; + struct bpf_prog *xdp_prog; struct rcu_head rcu; unsigned int idx; struct bpf_devmap_val val; @@ -111,12 +116,18 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { + u32 valsize = attr->value_size; u64 cost = 0; int err; - /* check sanity of attributes */ + /* check
sanity of attributes. 2 value sizes supported: + * 4 bytes: ifindex + * 8 bytes: ifindex + prog fd + */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) + (valsize != offsetofend(struct bpf_devmap_val, ifindex) && + valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) || + attr->map_flags & ~DEV_CREATE_FLAG_MASK) return -EINVAL; /* Lookup returns a pointer straight to dev->ifindex, so make sure the @@ -223,6 +234,8 @@ static void dev_map_free(struct bpf_map *map) hlist_for_each_entry_safe(dev, next, head, index_hlist) { hlist_del_rcu(&dev->index_hlist); + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -237,6 +250,8 @@ static void dev_map_free(struct bpf_map *map) if (!dev) continue; + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -323,6 +338,16 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, return -ENOENT; } +bool dev_map_can_have_prog(struct bpf_map *map) +{ + if ((map->map_type == BPF_MAP_TYPE_DEVMAP || + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) && + map->value_size != offsetofend(struct bpf_devmap_val, ifindex)) + return true; + + return false; +} + static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { struct net_device *dev = bq->dev; @@ -447,6 +472,30 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, return bq_enqueue(dev, xdpf, dev_rx); } +static struct xdp_buff *dev_map_run_prog(struct net_device *dev, + struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + u32 act; + + act = bpf_prog_run_xdp(xdp_prog, xdp); + switch (act) { + case XDP_PASS: + return xdp; + case XDP_DROP: + break; + default: + bpf_warn_invalid_xdp_action(act); + fallthrough; + case XDP_ABORTED: + trace_xdp_exception(dev, xdp_prog, act); + break; + } + + xdp_return_buff(xdp); + return NULL; +} + int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, struct net_device *dev_rx) { @@ -458,6 +507,11 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, { struct net_device *dev = dst->dev; + if (dst->xdp_prog) { + xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog); + if (!xdp) + return 0; + } return __xdp_enqueue(dev, xdp, dev_rx); } @@ -494,6 +548,8 @@ static void __dev_map_entry_free(struct rcu_head *rcu) struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -547,6 +603,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_devmap_val *val, unsigned int idx) { + struct bpf_prog *prog = NULL; struct bpf_dtab_netdev *dev; dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, @@ -558,11 +615,31 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, if (!dev->dev) goto err_out; + if (val->bpf_prog.fd >= 0) { + prog = bpf_prog_get_type_dev(val->bpf_prog.fd, + BPF_PROG_TYPE_XDP, false); + if (IS_ERR(prog)) + goto err_put_dev; + if (prog->expected_attach_type != BPF_XDP_DEVMAP) + goto err_put_prog; + } + dev->idx = idx; dev->dtab = dtab; + if (prog) { + dev->xdp_prog = prog; + dev->val.bpf_prog.id = prog->aux->id; + } else { + dev->xdp_prog = NULL; + dev->val.bpf_prog.id = 0; + } dev->val.ifindex = val->ifindex; return dev; +err_put_prog: + bpf_prog_put(prog); +err_put_dev: + dev_put(dev->dev); err_out: kfree(dev); return ERR_PTR(-EINVAL); @@ -572,8 +649,8 @@ static int __dev_map_update_elem(struct net 
*net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; - struct bpf_devmap_val val = { }; u32 i = *(u32 *)key; if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; @@ -588,6 +665,9 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, if (!val.ifindex) { dev = NULL; + /* can not specify fd if ifindex is 0 */ + if (val.bpf_prog.fd != -1) + return -EINVAL; } else { dev = __dev_map_alloc_node(net, dtab, &val, i); if (IS_ERR(dev)) @@ -616,8 +696,8 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; - struct bpf_devmap_val val = { }; u32 idx = *(u32 *)key; unsigned long flags; int err = -EEXIST; diff --git a/net/core/dev.c b/net/core/dev.c index ae37586f6ee8..10684833f864 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5420,6 +5420,18 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) struct bpf_prog *new = xdp->prog; int ret = 0; + if (new) { + u32 i; + + /* generic XDP does not work with DEVMAPs that can + * have a bpf_prog installed on an entry + */ + for (i = 0; i < new->aux->used_map_cnt; i++) { + if (dev_map_can_have_prog(new->aux->used_maps[i])) + return -EINVAL; + } + } + switch (xdp->command) { case XDP_SETUP_PROG: rcu_assign_pointer(dev->xdp_prog, new); @@ -8835,6 +8847,12 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return -EINVAL; } + if (prog->expected_attach_type == BPF_XDP_DEVMAP) { + NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); + bpf_prog_put(prog); + return -EINVAL; + } + /* prog->aux->id may be 0 for orphaned device-bound progs */ if (prog->aux->id && prog->aux->id == prog_id) { bpf_prog_put(prog); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 974ca6e948e3..65d7717bce2f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -225,6 +225,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_GETPEERNAME, BPF_CGROUP_INET4_GETSOCKNAME, BPF_CGROUP_INET6_GETSOCKNAME, + BPF_XDP_DEVMAP, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From 64b59025c15b244c0954cf52b24fbabfcf5ed8f6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 29 May 2020 16:07:14 -0600 Subject: xdp: Add xdp_txq_info to xdp_buff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add xdp_txq_info as the Tx counterpart to xdp_rxq_info. At the moment only the device is added. Other fields (queue_index) can be added as use cases arise. From a UAPI perspective, add egress_ifindex to xdp context for bpf programs to see the Tx device. Update the verifier to only allow accesses to egress_ifindex by XDP programs with BPF_XDP_DEVMAP expected attach type.
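As a hedged illustration (not part of this patch; the section name and the hairpin check are assumptions following libbpf conventions of the time), a program written for a devmap entry could read the new field like this:

	/* Sketch: an XDP program loaded with expected_attach_type
	 * BPF_XDP_DEVMAP; the verifier rejects the egress_ifindex load
	 * for any other XDP program.
	 */
	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	SEC("xdp_devmap")
	int devmap_filter(struct xdp_md *ctx)
	{
		/* egress_ifindex resolves to txq->dev->ifindex */
		if (ctx->egress_ifindex == ctx->ingress_ifindex)
			return XDP_DROP; /* e.g. suppress hairpin redirects */
		return XDP_PASS;
	}

	char _license[] SEC("license") = "GPL";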
Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200529220716.75383-4-dsahern@kernel.org Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 5 +++++ include/uapi/linux/bpf.h | 2 ++ kernel/bpf/devmap.c | 3 +++ net/core/filter.c | 17 +++++++++++++++++ tools/include/uapi/linux/bpf.h | 2 ++ 5 files changed, 29 insertions(+) (limited to 'net') diff --git a/include/net/xdp.h b/include/net/xdp.h index 90f11760bd12..d54022959491 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -61,12 +61,17 @@ struct xdp_rxq_info { struct xdp_mem_info mem; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ +struct xdp_txq_info { + struct net_device *dev; +}; + struct xdp_buff { void *data; void *data_end; void *data_meta; void *data_hard_start; struct xdp_rxq_info *rxq; + struct xdp_txq_info *txq; u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f1e364d69007..f862a58fb567 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3707,6 +3707,8 @@ struct xdp_md { /* Below access go through struct xdp_rxq_info */ __u32 ingress_ifindex; /* rxq->dev->ifindex */ __u32 rx_queue_index; /* rxq->queue_index */ + + __u32 egress_ifindex; /* txq->dev->ifindex */ }; enum sk_action { diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 0089d56617ec..c04fb1c72f5e 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -476,8 +476,11 @@ static struct xdp_buff *dev_map_run_prog(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { + struct xdp_txq_info txq = { .dev = dev }; u32 act; + xdp->txq = &txq; + act = bpf_prog_run_xdp(xdp_prog, xdp); switch (act) { case XDP_PASS: diff --git a/net/core/filter.c b/net/core/filter.c index 0008b029d644..85ff827aab73 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7015,6 +7015,13 @@ static bool xdp_is_valid_access(int off, int size, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { + if (prog->expected_attach_type != BPF_XDP_DEVMAP) { + switch (off) { + case offsetof(struct xdp_md, egress_ifindex): + return false; + } + } + if (type == BPF_WRITE) { if (bpf_prog_is_dev_bound(prog->aux)) { switch (off) { @@ -7985,6 +7992,16 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, offsetof(struct xdp_rxq_info, queue_index)); break; + case offsetof(struct xdp_md, egress_ifindex): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, txq)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev), + si->dst_reg, si->dst_reg, + offsetof(struct xdp_txq_info, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct net_device, ifindex)); + break; } return insn - insn_buf; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 65d7717bce2f..f74bc4a2385e 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3706,6 +3706,8 @@ struct xdp_md { /* Below access go through struct xdp_rxq_info */ __u32 ingress_ifindex; /* rxq->dev->ifindex */ __u32 rx_queue_index; /* rxq->queue_index */ + + __u32 egress_ifindex; /* txq->dev->ifindex */ }; enum sk_action { -- cgit v1.2.3 From ca2f5f21dbbd5e3a00cd3e97f728aa2ca0b2e011 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 29 May 2020 16:06:41 -0700 Subject: bpf: Refactor sockmap redirect code so its easy to reuse We 
will need this block of code called from the TLS context shortly, so let's refactor the redirect logic so it's easy to reuse. This also cleans up the switch stmt so we have fewer fallthrough cases. No logic changes are intended. Fixes: d829e9c4112b5 ("tls: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Sitnicki Acked-by: Song Liu Link: https://lore.kernel.org/bpf/159079360110.5745.7024009076049029819.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- net/core/skmsg.c | 55 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index c479372f2cd2..9d72f71e9b47 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -682,13 +682,43 @@ static struct sk_psock *sk_psock_from_strp(struct strparser *strp) return container_of(parser, struct sk_psock, parser); } -static void sk_psock_verdict_apply(struct sk_psock *psock, - struct sk_buff *skb, int verdict) +static void sk_psock_skb_redirect(struct sk_psock *psock, struct sk_buff *skb) { struct sk_psock *psock_other; struct sock *sk_other; bool ingress; + sk_other = tcp_skb_bpf_redirect_fetch(skb); + if (unlikely(!sk_other)) { + kfree_skb(skb); + return; + } + psock_other = sk_psock(sk_other); + if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || + !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { + kfree_skb(skb); + return; + } + + ingress = tcp_skb_bpf_ingress(skb); + if ((!ingress && sock_writeable(sk_other)) || + (ingress && + atomic_read(&sk_other->sk_rmem_alloc) <= + sk_other->sk_rcvbuf)) { + if (!ingress) + skb_set_owner_w(skb, sk_other); + skb_queue_tail(&psock_other->ingress_skb, skb); + schedule_work(&psock_other->work); + } else { + kfree_skb(skb); + } +} + +static void sk_psock_verdict_apply(struct sk_psock *psock, + struct sk_buff *skb, int verdict) +{ + struct sock *sk_other; + switch (verdict) { case __SK_PASS: sk_other = psock->sk; @@ -707,25 +737,8 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, } goto out_free; case __SK_REDIRECT: - sk_other = tcp_skb_bpf_redirect_fetch(skb); - if (unlikely(!sk_other)) - goto out_free; - psock_other = sk_psock(sk_other); - if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || - !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) - goto out_free; - ingress = tcp_skb_bpf_ingress(skb); - if ((!ingress && sock_writeable(sk_other)) || - (ingress && - atomic_read(&sk_other->sk_rmem_alloc) <= - sk_other->sk_rcvbuf)) { - if (!ingress) - skb_set_owner_w(skb, sk_other); - skb_queue_tail(&psock_other->ingress_skb, skb); - schedule_work(&psock_other->work); - break; - } - /* fall-through */ + sk_psock_skb_redirect(psock, skb); + break; case __SK_DROP: /* fall-through */ default: -- cgit v1.2.3 From e91de6afa81c10e9f855c5695eb9a53168d96b73 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 29 May 2020 16:06:59 -0700 Subject: bpf: Fix running sk_skb program types with ktls KTLS uses a stream parser to collect TLS messages and send them to the upper layer tls receive handler. This ensures the tls receiver has a full TLS header to parse when it is run. However, when a socket has a BPF_SK_SKB_STREAM_VERDICT program attached before KTLS is enabled we end up with two stream parsers running on the same socket. The result is that both try to run on the same socket. First the KTLS stream parser runs and calls read_sock(), which for TCP is tcp_read_sock(), which in turn calls tcp_rcv_skb().
This dequeues the skb from the sk_receive_queue. When this is done, the KTLS code then calls the data_ready() callback which, because we stacked KTLS on top of the bpf stream verdict program, has been replaced by sk_psock_start_strp(). This will in turn kick the stream parser again and eventually do the same thing KTLS did above, calling into tcp_rcv_skb() and dequeuing an skb from the sk_receive_queue. At this point the data stream is broken. Part of the stream was handled by the KTLS side; some other bytes may have been handled by the BPF side. Generally this results in either missing data or, more likely, a "Bad Message" complaint from the kTLS receive handler, as the BPF program steals some bytes meant to be in a TLS header and/or the TLS header length is no longer correct. We've already broken the idealized model where we can stack ULPs in any order with generic callbacks on the TX side to handle this. So in this patch we do the same thing but for the RX side. We add a sk_psock_strp_enabled() helper so TLS can learn that a BPF verdict program is running and add a tls_sw_has_ctx_rx() helper so the BPF side can learn there is a TLS ULP on the socket. Then on the BPF side we omit calling our stream parser to avoid breaking the data stream for the KTLS receiver. Then on the KTLS side we call BPF_SK_SKB_STREAM_VERDICT once the KTLS receiver is done with the packet but before it posts the msg to userspace. This gives us symmetry between the TX and RX halves and IMO makes it usable again. On the TX side we process packets in this order: BPF -> TLS -> TCP, and on the receive side in the reverse order: TCP -> TLS -> BPF. Discovered while testing the OpenSSL 3.0 Alpha2.0 release. Fixes: d829e9c4112b5 ("tls: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/159079361946.5745.605854335665044485.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- include/linux/skmsg.h | 8 ++++++++ include/net/tls.h | 9 +++++++++ net/core/skmsg.c | 43 ++++++++++++++++++++++++++++++++++++++++--- net/tls/tls_sw.c | 20 ++++++++++++++++++-- 4 files changed, 75 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index ad31c9fb7158..08674cd14d5a 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -437,4 +437,12 @@ static inline void psock_progs_drop(struct sk_psock_progs *progs) psock_set_prog(&progs->skb_verdict, NULL); } +int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb); + +static inline bool sk_psock_strp_enabled(struct sk_psock *psock) +{ + if (!psock) + return false; + return psock->parser.enabled; +} #endif /* _LINUX_SKMSG_H */ diff --git a/include/net/tls.h b/include/net/tls.h index 3e7b44cae0d9..3212d3c214a9 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -571,6 +571,15 @@ static inline bool tls_sw_has_ctx_tx(const struct sock *sk) return !!tls_sw_ctx_tx(ctx); } +static inline bool tls_sw_has_ctx_rx(const struct sock *sk) +{ + struct tls_context *ctx = tls_get_ctx(sk); + + if (!ctx) + return false; + return !!tls_sw_ctx_rx(ctx); +} + void tls_sw_write_space(struct sock *sk, struct tls_context *ctx); void tls_device_write_space(struct sock *sk, struct tls_context *ctx); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 9d72f71e9b47..351afbf6bfba 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -7,6 +7,7 @@ #include #include +#include static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce) { @@ -714,6 +715,38 @@ static void
sk_psock_skb_redirect(struct sk_psock *psock, struct sk_buff *skb) } } +static void sk_psock_tls_verdict_apply(struct sk_psock *psock, + struct sk_buff *skb, int verdict) +{ + switch (verdict) { + case __SK_REDIRECT: + sk_psock_skb_redirect(psock, skb); + break; + case __SK_PASS: + case __SK_DROP: + default: + break; + } +} + +int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) +{ + struct bpf_prog *prog; + int ret = __SK_PASS; + + rcu_read_lock(); + prog = READ_ONCE(psock->progs.skb_verdict); + if (likely(prog)) { + tcp_skb_bpf_redirect_clear(skb); + ret = sk_psock_bpf_run(psock, prog, skb); + ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + } + rcu_read_unlock(); + sk_psock_tls_verdict_apply(psock, skb, ret); + return ret; +} +EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); + static void sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, int verdict) { @@ -792,9 +825,13 @@ static void sk_psock_strp_data_ready(struct sock *sk) rcu_read_lock(); psock = sk_psock(sk); if (likely(psock)) { - write_lock_bh(&sk->sk_callback_lock); - strp_data_ready(&psock->parser.strp); - write_unlock_bh(&sk->sk_callback_lock); + if (tls_sw_has_ctx_rx(sk)) { + psock->parser.saved_data_ready(sk); + } else { + write_lock_bh(&sk->sk_callback_lock); + strp_data_ready(&psock->parser.strp); + write_unlock_bh(&sk->sk_callback_lock); + } } rcu_read_unlock(); } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 8c2763eb6aae..24f64bc0de18 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1742,6 +1742,7 @@ int tls_sw_recvmsg(struct sock *sk, long timeo; bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); bool is_peek = flags & MSG_PEEK; + bool bpf_strp_enabled; int num_async = 0; int pending; @@ -1752,6 +1753,7 @@ int tls_sw_recvmsg(struct sock *sk, psock = sk_psock_get(sk); lock_sock(sk); + bpf_strp_enabled = sk_psock_strp_enabled(psock); /* Process pending decrypted records. It must be non-zero-copy */ err = process_rx_list(ctx, msg, &control, &cmsg, 0, len, false, @@ -1805,11 +1807,12 @@ int tls_sw_recvmsg(struct sock *sk, if (to_decrypt <= len && !is_kvec && !is_peek && ctx->control == TLS_RECORD_TYPE_DATA && - prot->version != TLS_1_3_VERSION) + prot->version != TLS_1_3_VERSION && + !bpf_strp_enabled) zc = true; /* Do not use async mode if record is non-data */ - if (ctx->control == TLS_RECORD_TYPE_DATA) + if (ctx->control == TLS_RECORD_TYPE_DATA && !bpf_strp_enabled) async_capable = ctx->async_capable; else async_capable = false; @@ -1859,6 +1862,19 @@ int tls_sw_recvmsg(struct sock *sk, goto pick_next_record; if (!zc) { + if (bpf_strp_enabled) { + err = sk_psock_tls_strp_read(psock, skb); + if (err != __SK_PASS) { + rxm->offset = rxm->offset + rxm->full_len; + rxm->full_len = 0; + if (err == __SK_DROP) + consume_skb(skb); + ctx->recv_pkt = NULL; + __strp_unpause(&ctx->strp); + continue; + } + } + if (rxm->full_len > len) { retain_skb = true; chunk = len; -- cgit v1.2.3 From 8ea204c2b658eaef55b4716fde469fb66c589a3d Mon Sep 17 00:00:00 2001 From: Ferenc Fejes Date: Sat, 30 May 2020 23:09:00 +0200 Subject: net: Make locking in sock_bindtoindex optional sock_bindtoindex() is intended for kernel-wide usage; however, it locks the socket regardless of the context. This modification makes the locking optional: the socket is locked only when sock_bindtoindex() is called with lock_sk = true. The modification is applied to all users of sock_bindtoindex().
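For illustration only, a minimal kernel-side sketch (bind_example() is hypothetical, not part of this patch) of how a caller would pick the lock_sk value:

	#include <net/sock.h>

	static int bind_example(struct sock *sk, int ifindex, bool already_locked)
	{
		/* Pass lock_sk = false only when the socket lock is already
		 * held (e.g. from a setsockopt handler); passing true there
		 * would deadlock in lock_sock().
		 */
		return sock_bindtoindex(sk, ifindex, !already_locked);
	}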
Signed-off-by: Ferenc Fejes Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/bee6355da40d9e991b2f2d12b67d55ebb5f5b207.1590871065.git.fejes@inf.elte.hu --- include/net/sock.h | 2 +- net/core/sock.c | 10 ++++++---- net/ipv4/udp_tunnel.c | 2 +- net/ipv6/ip6_udp_tunnel.c | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 6e9f713a7860..c53cc42b5ab9 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2690,7 +2690,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) void sock_def_readable(struct sock *sk); -int sock_bindtoindex(struct sock *sk, int ifindex); +int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk); void sock_enable_timestamps(struct sock *sk); void sock_no_linger(struct sock *sk); void sock_set_keepalive(struct sock *sk); diff --git a/net/core/sock.c b/net/core/sock.c index 61ec573221a6..6c4acf1f0220 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -594,13 +594,15 @@ out: return ret; } -int sock_bindtoindex(struct sock *sk, int ifindex) +int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) { int ret; - lock_sock(sk); + if (lock_sk) + lock_sock(sk); ret = sock_bindtoindex_locked(sk, ifindex); - release_sock(sk); + if (lock_sk) + release_sock(sk); return ret; } @@ -646,7 +648,7 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval, goto out; } - return sock_bindtoindex(sk, index); + return sock_bindtoindex(sk, index, true); out: #endif diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 2158e8bddf41..3eecba0874aa 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -22,7 +22,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, goto error; if (cfg->bind_ifindex) { - err = sock_bindtoindex(sock->sk, cfg->bind_ifindex); + err = sock_bindtoindex(sock->sk, cfg->bind_ifindex, true); if (err < 0) goto error; } diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 2e0ad1bc84a8..cdc4d4ee2420 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -30,7 +30,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, goto error; } if (cfg->bind_ifindex) { - err = sock_bindtoindex(sock->sk, cfg->bind_ifindex); + err = sock_bindtoindex(sock->sk, cfg->bind_ifindex, true); if (err < 0) goto error; } -- cgit v1.2.3 From 70c58997c1e864c96dfdf072572047303db8f42a Mon Sep 17 00:00:00 2001 From: Ferenc Fejes Date: Sat, 30 May 2020 23:09:01 +0200 Subject: bpf: Allow SO_BINDTODEVICE opt in bpf_setsockopt Extend the supported sockopts in bpf_setsockopt with SO_BINDTODEVICE. We call sock_bindtoindex with parameter lock_sk = false in this context because we already own the socket.
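A hedged usage sketch, not from this patch: a sockops program binding outgoing TCP sockets to a device. The SOL_SOCKET and SO_BINDTODEVICE values below are the asm-generic ones, and "eth0" is a placeholder name:

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	#ifndef SOL_SOCKET
	#define SOL_SOCKET 1		/* asm-generic value */
	#endif
	#ifndef SO_BINDTODEVICE
	#define SO_BINDTODEVICE 25	/* asm-generic value */
	#endif

	SEC("sockops")
	int bind_to_dev(struct bpf_sock_ops *skops)
	{
		char dev[] = "eth0";	/* placeholder device name */

		if (skops->op == BPF_SOCK_OPS_TCP_CONNECT_CB)
			bpf_setsockopt(skops, SOL_SOCKET, SO_BINDTODEVICE,
				       dev, sizeof(dev));
		return 1;
	}

	char _license[] SEC("license") = "GPL";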
Signed-off-by: Ferenc Fejes Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/4149e304867b8d5a606a305bc59e29b063e51f49.1590871065.git.fejes@inf.elte.hu --- net/core/filter.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 85ff827aab73..ae82bcb03124 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4248,6 +4248,9 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { static int _bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen, u32 flags) { + char devname[IFNAMSIZ]; + struct net *net; + int ifindex; int ret = 0; int val; @@ -4257,7 +4260,7 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, sock_owned_by_me(sk); if (level == SOL_SOCKET) { - if (optlen != sizeof(int)) + if (optlen != sizeof(int) && optname != SO_BINDTODEVICE) return -EINVAL; val = *((int *)optval); @@ -4298,6 +4301,29 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, sk_dst_reset(sk); } break; + case SO_BINDTODEVICE: + ret = -ENOPROTOOPT; +#ifdef CONFIG_NETDEVICES + optlen = min_t(long, optlen, IFNAMSIZ - 1); + strncpy(devname, optval, optlen); + devname[optlen] = 0; + + ifindex = 0; + if (devname[0] != '\0') { + struct net_device *dev; + + ret = -ENODEV; + + net = sock_net(sk); + dev = dev_get_by_name(net, devname); + if (!dev) + break; + ifindex = dev->ifindex; + dev_put(dev); + } + ret = sock_bindtoindex(sk, ifindex, false); +#endif + break; default: ret = -EINVAL; } -- cgit v1.2.3 From 171526f6fee84de0c39e2b7aa7e666ba0bbfd173 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:35 +0200 Subject: flow_dissector: Pull locking up from prog attach callback Split out the part of attach callback that happens with attach/detach lock acquired. This structures the prog attach callback in a way that opens up doors for moving the locking out of flow_dissector and into generic callbacks for attaching/detaching progs to netns in subsequent patches. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20200531082846.2117903-2-jakub@cloudflare.com --- net/core/flow_dissector.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 0aeb33572feb..b64a44a083fd 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -109,15 +109,10 @@ int skb_flow_dissector_prog_query(const union bpf_attr *attr, return 0; } -int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, - struct bpf_prog *prog) +static int flow_dissector_bpf_prog_attach(struct net *net, + struct bpf_prog *prog) { struct bpf_prog *attached; - struct net *net; - int ret = 0; - - net = current->nsproxy->net_ns; - mutex_lock(&flow_dissector_mutex); if (net == &init_net) { /* BPF flow dissector in the root namespace overrides @@ -130,33 +125,38 @@ int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, for_each_net(ns) { if (ns == &init_net) continue; - if (rcu_access_pointer(ns->flow_dissector_prog)) { - ret = -EEXIST; - goto out; - } + if (rcu_access_pointer(ns->flow_dissector_prog)) + return -EEXIST; } } else { /* Make sure root flow dissector is not attached * when attaching to the non-root namespace. 
*/ - if (rcu_access_pointer(init_net.flow_dissector_prog)) { - ret = -EEXIST; - goto out; - } + if (rcu_access_pointer(init_net.flow_dissector_prog)) + return -EEXIST; } attached = rcu_dereference_protected(net->flow_dissector_prog, lockdep_is_held(&flow_dissector_mutex)); - if (attached == prog) { + if (attached == prog) /* The same program cannot be attached twice */ - ret = -EINVAL; - goto out; - } + return -EINVAL; + rcu_assign_pointer(net->flow_dissector_prog, prog); if (attached) bpf_prog_put(attached); -out: + return 0; +} + +int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + int ret; + + mutex_lock(&flow_dissector_mutex); + ret = flow_dissector_bpf_prog_attach(current->nsproxy->net_ns, prog); mutex_unlock(&flow_dissector_mutex); + return ret; } -- cgit v1.2.3 From a3fd7ceee05431d2c51ed86c6cae015d236a51f0 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:36 +0200 Subject: net: Introduce netns_bpf for BPF programs attached to netns In order to: (1) attach more than one BPF program type to netns, or (2) support attaching BPF programs to netns with bpf_link, or (3) support multi-prog attach points for netns we will need to keep more state per netns than a single pointer like we have now for BPF flow dissector program. Prepare for the above by extracting netns_bpf that is part of struct net, for storing all state related to BPF programs attached to netns. Turn flow dissector callbacks for querying/attaching/detaching a program into generic ones that operate on netns_bpf. Next patch will move the generic callbacks into their own module. This is similar to how it is organized for cgroup with cgroup_bpf. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Cc: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20200531082846.2117903-3-jakub@cloudflare.com --- include/linux/bpf-netns.h | 56 +++++++++++++++++++++++ include/linux/skbuff.h | 26 ----------- include/net/net_namespace.h | 4 +- include/net/netns/bpf.h | 17 +++++++ kernel/bpf/syscall.c | 7 +-- net/core/flow_dissector.c | 105 +++++++++++++++++++++++++++++--------------- 6 files changed, 149 insertions(+), 66 deletions(-) create mode 100644 include/linux/bpf-netns.h create mode 100644 include/net/netns/bpf.h (limited to 'net') diff --git a/include/linux/bpf-netns.h b/include/linux/bpf-netns.h new file mode 100644 index 000000000000..f3aec3d79824 --- /dev/null +++ b/include/linux/bpf-netns.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BPF_NETNS_H +#define _BPF_NETNS_H + +#include +#include + +enum netns_bpf_attach_type { + NETNS_BPF_INVALID = -1, + NETNS_BPF_FLOW_DISSECTOR = 0, + MAX_NETNS_BPF_ATTACH_TYPE +}; + +static inline enum netns_bpf_attach_type +to_netns_bpf_attach_type(enum bpf_attach_type attach_type) +{ + switch (attach_type) { + case BPF_FLOW_DISSECTOR: + return NETNS_BPF_FLOW_DISSECTOR; + default: + return NETNS_BPF_INVALID; + } +} + +/* Protects updates to netns_bpf */ +extern struct mutex netns_bpf_mutex; + +union bpf_attr; +struct bpf_prog; + +#ifdef CONFIG_NET +int netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr); +int netns_bpf_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog); +int netns_bpf_prog_detach(const union bpf_attr *attr); +#else +static inline int netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EOPNOTSUPP; +} + +static inline int netns_bpf_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + 
return -EOPNOTSUPP; +} + +static inline int netns_bpf_prog_detach(const union bpf_attr *attr) +{ + return -EOPNOTSUPP; +} +#endif + +#endif /* _BPF_NETNS_H */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 531843952809..a0d5c2760103 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1283,32 +1283,6 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count); -#ifdef CONFIG_NET -int skb_flow_dissector_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr); -int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, - struct bpf_prog *prog); - -int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr); -#else -static inline int skb_flow_dissector_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) -{ - return -EOPNOTSUPP; -} - -static inline int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, - struct bpf_prog *prog) -{ - return -EOPNOTSUPP; -} - -static inline int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) -{ - return -EOPNOTSUPP; -} -#endif - struct bpf_flow_dissector; bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, __be16 proto, int nhoff, int hlen, unsigned int flags); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 8e001e049497..2ee5901bec7a 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -162,7 +163,8 @@ struct net { #endif struct net_generic __rcu *gen; - struct bpf_prog __rcu *flow_dissector_prog; + /* Used to store attached BPF programs */ + struct netns_bpf bpf; /* Note : following structs are cache line aligned */ #ifdef CONFIG_XFRM diff --git a/include/net/netns/bpf.h b/include/net/netns/bpf.h new file mode 100644 index 000000000000..a858d1c5b166 --- /dev/null +++ b/include/net/netns/bpf.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF programs attached to network namespace + */ + +#ifndef __NETNS_BPF_H__ +#define __NETNS_BPF_H__ + +#include + +struct bpf_prog; + +struct netns_bpf { + struct bpf_prog __rcu *progs[MAX_NETNS_BPF_ATTACH_TYPE]; +}; + +#endif /* __NETNS_BPF_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e83b0818b529..c77ab9c76f7b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -27,6 +27,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -2868,7 +2869,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) ret = lirc_prog_attach(attr, prog); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: - ret = skb_flow_dissector_bpf_prog_attach(attr, prog); + ret = netns_bpf_prog_attach(attr, prog); break; case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: @@ -2908,7 +2909,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_FLOW_DISSECTOR: if (!capable(CAP_NET_ADMIN)) return -EPERM; - return skb_flow_dissector_bpf_prog_detach(attr); + return netns_bpf_prog_detach(attr); case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: @@ -2961,7 +2962,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); case BPF_FLOW_DISSECTOR: - return skb_flow_dissector_prog_query(attr, uattr); + return 
netns_bpf_prog_query(attr, uattr); default: return -EINVAL; } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index b64a44a083fd..6c1b8e43d611 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -31,8 +31,10 @@ #include #include #endif +#include -static DEFINE_MUTEX(flow_dissector_mutex); +/* Protects updates to netns_bpf */ +DEFINE_MUTEX(netns_bpf_mutex); static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) @@ -70,23 +72,28 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, } EXPORT_SYMBOL(skb_flow_dissector_init); -int skb_flow_dissector_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) +int netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) { __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); u32 prog_id, prog_cnt = 0, flags = 0; + enum netns_bpf_attach_type type; struct bpf_prog *attached; struct net *net; if (attr->query.query_flags) return -EINVAL; + type = to_netns_bpf_attach_type(attr->query.attach_type); + if (type < 0) + return -EINVAL; + net = get_net_ns_by_fd(attr->query.target_fd); if (IS_ERR(net)) return PTR_ERR(net); rcu_read_lock(); - attached = rcu_dereference(net->flow_dissector_prog); + attached = rcu_dereference(net->bpf.progs[type]); if (attached) { prog_cnt = 1; prog_id = attached->aux->id; @@ -112,6 +119,7 @@ int skb_flow_dissector_prog_query(const union bpf_attr *attr, static int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog) { + enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; struct bpf_prog *attached; if (net == &init_net) { @@ -125,74 +133,97 @@ static int flow_dissector_bpf_prog_attach(struct net *net, for_each_net(ns) { if (ns == &init_net) continue; - if (rcu_access_pointer(ns->flow_dissector_prog)) + if (rcu_access_pointer(ns->bpf.progs[type])) return -EEXIST; } } else { /* Make sure root flow dissector is not attached * when attaching to the non-root namespace. */ - if (rcu_access_pointer(init_net.flow_dissector_prog)) + if (rcu_access_pointer(init_net.bpf.progs[type])) return -EEXIST; } - attached = rcu_dereference_protected(net->flow_dissector_prog, - lockdep_is_held(&flow_dissector_mutex)); + attached = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); if (attached == prog) /* The same program cannot be attached twice */ return -EINVAL; - rcu_assign_pointer(net->flow_dissector_prog, prog); + rcu_assign_pointer(net->bpf.progs[type], prog); if (attached) bpf_prog_put(attached); return 0; } -int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, - struct bpf_prog *prog) +int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { + enum netns_bpf_attach_type type; + struct net *net; int ret; - mutex_lock(&flow_dissector_mutex); - ret = flow_dissector_bpf_prog_attach(current->nsproxy->net_ns, prog); - mutex_unlock(&flow_dissector_mutex); + type = to_netns_bpf_attach_type(attr->attach_type); + if (type < 0) + return -EINVAL; + + net = current->nsproxy->net_ns; + mutex_lock(&netns_bpf_mutex); + switch (type) { + case NETNS_BPF_FLOW_DISSECTOR: + ret = flow_dissector_bpf_prog_attach(net, prog); + break; + default: + ret = -EINVAL; + break; + } + mutex_unlock(&netns_bpf_mutex); return ret; } -static int flow_dissector_bpf_prog_detach(struct net *net) +/* Must be called with netns_bpf_mutex held. 
*/ +static int __netns_bpf_prog_detach(struct net *net, + enum netns_bpf_attach_type type) { struct bpf_prog *attached; - mutex_lock(&flow_dissector_mutex); - attached = rcu_dereference_protected(net->flow_dissector_prog, - lockdep_is_held(&flow_dissector_mutex)); - if (!attached) { - mutex_unlock(&flow_dissector_mutex); + attached = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); + if (!attached) return -ENOENT; - } - RCU_INIT_POINTER(net->flow_dissector_prog, NULL); + RCU_INIT_POINTER(net->bpf.progs[type], NULL); bpf_prog_put(attached); - mutex_unlock(&flow_dissector_mutex); return 0; } -int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) +int netns_bpf_prog_detach(const union bpf_attr *attr) { - return flow_dissector_bpf_prog_detach(current->nsproxy->net_ns); + enum netns_bpf_attach_type type; + int ret; + + type = to_netns_bpf_attach_type(attr->attach_type); + if (type < 0) + return -EINVAL; + + mutex_lock(&netns_bpf_mutex); + ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type); + mutex_unlock(&netns_bpf_mutex); + + return ret; } -static void __net_exit flow_dissector_pernet_pre_exit(struct net *net) +static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) { - /* We're not racing with attach/detach because there are no - * references to netns left when pre_exit gets called. - */ - if (rcu_access_pointer(net->flow_dissector_prog)) - flow_dissector_bpf_prog_detach(net); + enum netns_bpf_attach_type type; + + mutex_lock(&netns_bpf_mutex); + for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) + __netns_bpf_prog_detach(net, type); + mutex_unlock(&netns_bpf_mutex); } -static struct pernet_operations flow_dissector_pernet_ops __net_initdata = { - .pre_exit = flow_dissector_pernet_pre_exit, +static struct pernet_operations netns_bpf_pernet_ops __net_initdata = { + .pre_exit = netns_bpf_pernet_pre_exit, }; /** @@ -1044,11 +1075,13 @@ bool __skb_flow_dissect(const struct net *net, WARN_ON_ONCE(!net); if (net) { + enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; + rcu_read_lock(); - attached = rcu_dereference(init_net.flow_dissector_prog); + attached = rcu_dereference(init_net.bpf.progs[type]); if (!attached) - attached = rcu_dereference(net->flow_dissector_prog); + attached = rcu_dereference(net->bpf.progs[type]); if (attached) { struct bpf_flow_keys flow_keys; @@ -1870,6 +1903,6 @@ static int __init init_default_flow_dissectors(void) flow_keys_basic_dissector_keys, ARRAY_SIZE(flow_keys_basic_dissector_keys)); - return register_pernet_subsys(&flow_dissector_pernet_ops); + return register_pernet_subsys(&netns_bpf_pernet_ops); } core_initcall(init_default_flow_dissectors); -- cgit v1.2.3 From b27f7bb590ba835b32ef122389db158e44cfda1e Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:37 +0200 Subject: flow_dissector: Move out netns_bpf prog callbacks Move functions to manage BPF programs attached to netns that are not specific to flow dissector to a dedicated module named bpf/net_namespace.c. The set of functions will grow with the addition of bpf_link support for netns attached programs. This patch prepares ground by creating a place for it. This is a code move with no functional changes intended. 
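For context, a hedged userspace sketch (not part of this patch; pre-1.0 libbpf wrapper signatures assumed) of the fd-based attach and query paths these callbacks serve. Attach applies to the caller's current netns, while query takes an explicit netns fd as target_fd:

	#include <fcntl.h>
	#include <unistd.h>
	#include <bpf/bpf.h>

	int attach_and_query(int prog_fd)
	{
		__u32 prog_ids[1], prog_cnt = 1;
		int netns_fd, err;

		err = bpf_prog_attach(prog_fd, 0, BPF_FLOW_DISSECTOR, 0);
		if (err)
			return err;

		netns_fd = open("/proc/self/ns/net", O_RDONLY);
		if (netns_fd < 0)
			return -1;
		err = bpf_prog_query(netns_fd, BPF_FLOW_DISSECTOR, 0, NULL,
				     prog_ids, &prog_cnt);
		close(netns_fd);
		return err;
	}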
Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-4-jakub@cloudflare.com --- include/net/flow_dissector.h | 6 ++ kernel/bpf/Makefile | 1 + kernel/bpf/net_namespace.c | 133 +++++++++++++++++++++++++++++++++++++++++++ net/core/flow_dissector.c | 125 ++-------------------------------------- 4 files changed, 144 insertions(+), 121 deletions(-) create mode 100644 kernel/bpf/net_namespace.c (limited to 'net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 4fb1a69c6ecf..a7eba43fe4e4 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -8,6 +8,8 @@ #include #include +struct bpf_prog; +struct net; struct sk_buff; /** @@ -369,4 +371,8 @@ flow_dissector_init_keys(struct flow_dissector_key_control *key_control, memset(key_basic, 0, sizeof(*key_basic)); } +#ifdef CONFIG_BPF_SYSCALL +int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog); +#endif /* CONFIG_BPF_SYSCALL */ + #endif diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 8fca02f64811..1131a921e1a6 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -13,6 +13,7 @@ ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o obj-$(CONFIG_BPF_SYSCALL) += offload.o +obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c new file mode 100644 index 000000000000..b37d81450c3a --- /dev/null +++ b/kernel/bpf/net_namespace.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +/* + * Functions to manage BPF programs attached to netns + */ + +/* Protects updates to netns_bpf */ +DEFINE_MUTEX(netns_bpf_mutex); + +int netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + u32 prog_id, prog_cnt = 0, flags = 0; + enum netns_bpf_attach_type type; + struct bpf_prog *attached; + struct net *net; + + if (attr->query.query_flags) + return -EINVAL; + + type = to_netns_bpf_attach_type(attr->query.attach_type); + if (type < 0) + return -EINVAL; + + net = get_net_ns_by_fd(attr->query.target_fd); + if (IS_ERR(net)) + return PTR_ERR(net); + + rcu_read_lock(); + attached = rcu_dereference(net->bpf.progs[type]); + if (attached) { + prog_cnt = 1; + prog_id = attached->aux->id; + } + rcu_read_unlock(); + + put_net(net); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) + return -EFAULT; + if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) + return -EFAULT; + + if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) + return 0; + + if (copy_to_user(prog_ids, &prog_id, sizeof(u32))) + return -EFAULT; + + return 0; +} + +int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + enum netns_bpf_attach_type type; + struct net *net; + int ret; + + type = to_netns_bpf_attach_type(attr->attach_type); + if (type < 0) + return -EINVAL; + + net = current->nsproxy->net_ns; + mutex_lock(&netns_bpf_mutex); + switch (type) { + case NETNS_BPF_FLOW_DISSECTOR: + ret = flow_dissector_bpf_prog_attach(net, prog); + break; + default: + ret = -EINVAL; + break; + } + mutex_unlock(&netns_bpf_mutex); + + return ret; +} + +/* Must be called with netns_bpf_mutex held. 
*/ +static int __netns_bpf_prog_detach(struct net *net, + enum netns_bpf_attach_type type) +{ + struct bpf_prog *attached; + + attached = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); + if (!attached) + return -ENOENT; + RCU_INIT_POINTER(net->bpf.progs[type], NULL); + bpf_prog_put(attached); + return 0; +} + +int netns_bpf_prog_detach(const union bpf_attr *attr) +{ + enum netns_bpf_attach_type type; + int ret; + + type = to_netns_bpf_attach_type(attr->attach_type); + if (type < 0) + return -EINVAL; + + mutex_lock(&netns_bpf_mutex); + ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type); + mutex_unlock(&netns_bpf_mutex); + + return ret; +} + +static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) +{ + enum netns_bpf_attach_type type; + + mutex_lock(&netns_bpf_mutex); + for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) + __netns_bpf_prog_detach(net, type); + mutex_unlock(&netns_bpf_mutex); +} + +static struct pernet_operations netns_bpf_pernet_ops __net_initdata = { + .pre_exit = netns_bpf_pernet_pre_exit, +}; + +static int __init netns_bpf_init(void) +{ + return register_pernet_subsys(&netns_bpf_pernet_ops); +} + +subsys_initcall(netns_bpf_init); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 6c1b8e43d611..d02df0b6d0d9 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -33,9 +33,6 @@ #endif #include -/* Protects updates to netns_bpf */ -DEFINE_MUTEX(netns_bpf_mutex); - static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) { @@ -72,52 +69,8 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, } EXPORT_SYMBOL(skb_flow_dissector_init); -int netns_bpf_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) -{ - __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); - u32 prog_id, prog_cnt = 0, flags = 0; - enum netns_bpf_attach_type type; - struct bpf_prog *attached; - struct net *net; - - if (attr->query.query_flags) - return -EINVAL; - - type = to_netns_bpf_attach_type(attr->query.attach_type); - if (type < 0) - return -EINVAL; - - net = get_net_ns_by_fd(attr->query.target_fd); - if (IS_ERR(net)) - return PTR_ERR(net); - - rcu_read_lock(); - attached = rcu_dereference(net->bpf.progs[type]); - if (attached) { - prog_cnt = 1; - prog_id = attached->aux->id; - } - rcu_read_unlock(); - - put_net(net); - - if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) - return -EFAULT; - if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) - return -EFAULT; - - if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) - return 0; - - if (copy_to_user(prog_ids, &prog_id, sizeof(u32))) - return -EFAULT; - - return 0; -} - -static int flow_dissector_bpf_prog_attach(struct net *net, - struct bpf_prog *prog) +#ifdef CONFIG_BPF_SYSCALL +int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog) { enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; struct bpf_prog *attached; @@ -155,76 +108,7 @@ static int flow_dissector_bpf_prog_attach(struct net *net, bpf_prog_put(attached); return 0; } - -int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) -{ - enum netns_bpf_attach_type type; - struct net *net; - int ret; - - type = to_netns_bpf_attach_type(attr->attach_type); - if (type < 0) - return -EINVAL; - - net = current->nsproxy->net_ns; - mutex_lock(&netns_bpf_mutex); - switch (type) { - case NETNS_BPF_FLOW_DISSECTOR: - ret = 
flow_dissector_bpf_prog_attach(net, prog); - break; - default: - ret = -EINVAL; - break; - } - mutex_unlock(&netns_bpf_mutex); - - return ret; -} - -/* Must be called with netns_bpf_mutex held. */ -static int __netns_bpf_prog_detach(struct net *net, - enum netns_bpf_attach_type type) -{ - struct bpf_prog *attached; - - attached = rcu_dereference_protected(net->bpf.progs[type], - lockdep_is_held(&netns_bpf_mutex)); - if (!attached) - return -ENOENT; - RCU_INIT_POINTER(net->bpf.progs[type], NULL); - bpf_prog_put(attached); - return 0; -} - -int netns_bpf_prog_detach(const union bpf_attr *attr) -{ - enum netns_bpf_attach_type type; - int ret; - - type = to_netns_bpf_attach_type(attr->attach_type); - if (type < 0) - return -EINVAL; - - mutex_lock(&netns_bpf_mutex); - ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type); - mutex_unlock(&netns_bpf_mutex); - - return ret; -} - -static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) -{ - enum netns_bpf_attach_type type; - - mutex_lock(&netns_bpf_mutex); - for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) - __netns_bpf_prog_detach(net, type); - mutex_unlock(&netns_bpf_mutex); -} - -static struct pernet_operations netns_bpf_pernet_ops __net_initdata = { - .pre_exit = netns_bpf_pernet_pre_exit, -}; +#endif /* CONFIG_BPF_SYSCALL */ /** * __skb_flow_get_ports - extract the upper layer ports and return them @@ -1902,7 +1786,6 @@ static int __init init_default_flow_dissectors(void) skb_flow_dissector_init(&flow_keys_basic_dissector, flow_keys_basic_dissector_keys, ARRAY_SIZE(flow_keys_basic_dissector_keys)); - - return register_pernet_subsys(&netns_bpf_pernet_ops); + return 0; } core_initcall(init_default_flow_dissectors); -- cgit v1.2.3 From 4c21daae3dbc9f8536cc18e6e53627821fa2c90c Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 28 May 2020 22:34:07 +0800 Subject: tipc: Fix NULL pointer dereference in __tipc_sendstream() tipc_sendstream() may send a zero-length packet; tipc_msg_append() then does not allocate an skb, skb_peek_tail() returns NULL, and msg_set_ack_required() triggers a NULL pointer dereference. Reported-by: syzbot+8eac6d030e7807c21d32@syzkaller.appspotmail.com Fixes: 0a3e060f340d ("tipc: add test for Nagle algorithm effectiveness") Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/tipc/socket.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3734cdbedc9c..26123f4177fd 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1588,8 +1588,12 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) tsk->pkt_cnt += skb_queue_len(txq); } else { skb = skb_peek_tail(txq); - msg_set_ack_required(buf_msg(skb)); - tsk->expect_ack = true; + if (skb) { + msg_set_ack_required(buf_msg(skb)); + tsk->expect_ack = true; + } else { + tsk->expect_ack = false; + } tsk->msg_acc = 0; tsk->pkt_cnt = 0; } -- cgit v1.2.3 From 79a1f0ccdbb4ad700590f61b00525b390cb53905 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 1 Jun 2020 11:55:03 +0800 Subject: ipv6: fix IPV6_ADDRFORM operation logic Socket option IPV6_ADDRFORM currently supports UDP/UDPLITE and TCP.
Previously, the checking logic looked like this:

    if (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_UDPLITE)
            do_some_check;
    else if (sk->sk_protocol != IPPROTO_TCP)
            break;

After commit b6f6118901d1 ("ipv6: restrict IPV6_ADDRFORM operation"), TCP was blocked because the logic changed to:

    if (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_UDPLITE)
            do_some_check;
    else if (sk->sk_protocol == IPPROTO_TCP)
            do_some_check;
            break;
    else
            break;

Then, after commit 82c9ae440857 ("ipv6: fix restrict IPV6_ADDRFORM operation"), UDP/UDPLITE were blocked because the logic changed to:

    if (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_UDPLITE)
            do_some_check;
    if (sk->sk_protocol == IPPROTO_TCP)
            do_some_check;
    if (sk->sk_protocol != IPPROTO_TCP)
            break;

Fix it by using Eric's code and simply removing the break in the TCP check, so the logic becomes:

    if (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_UDPLITE)
            do_some_check;
    else if (sk->sk_protocol == IPPROTO_TCP)
            do_some_check;
    else
            break;

Fixes: 82c9ae440857 ("ipv6: fix restrict IPV6_ADDRFORM operation") Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/ipv6/ipv6_sockglue.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index adbfed6adf11..2c843ff5e3a9 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -218,14 +218,15 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = -EBUSY; break; } - } - if (sk->sk_protocol == IPPROTO_TCP && - sk->sk_prot != &tcpv6_prot) { - retv = -EBUSY; + } else if (sk->sk_protocol == IPPROTO_TCP) { + if (sk->sk_prot != &tcpv6_prot) { + retv = -EBUSY; + break; + } + } else { + break; } - if (sk->sk_protocol != IPPROTO_TCP) - break; + if (sk->sk_state != TCP_ESTABLISHED) { retv = -ENOTCONN; break; -- cgit v1.2.3 From 6abde0b241224347cd88e2ae75902e07f55c42cb Mon Sep 17 00:00:00 2001 From: Vinay Kumar Yadav Date: Tue, 2 Jun 2020 00:07:05 +0530 Subject: crypto/chtls: IPv6 support for inline TLS Extends support to IPv6 for the Inline TLS server. Signed-off-by: Vinay Kumar Yadav v1->v2: - cc'd tcp folks. v2->v3: - changed EXPORT_SYMBOL() to EXPORT_SYMBOL_GPL() Signed-off-by: David S. Miller --- drivers/crypto/chelsio/chtls/chtls_cm.c | 195 ++++++++++++++++++++++++------ drivers/crypto/chelsio/chtls/chtls_cm.h | 1 + drivers/crypto/chelsio/chtls/chtls_main.c | 14 ++- net/ipv6/tcp_ipv6.c | 1 + 4 files changed, 168 insertions(+), 43 deletions(-) (limited to 'net') diff --git a/drivers/crypto/chelsio/chtls/chtls_cm.c b/drivers/crypto/chelsio/chtls/chtls_cm.c index d5720a859443..9a642c79a657 100644 --- a/drivers/crypto/chelsio/chtls/chtls_cm.c +++ b/drivers/crypto/chelsio/chtls/chtls_cm.c @@ -18,13 +18,20 @@ #include #include #include +#include +#include +#include +#include #include #include #include #include +#include +#include #include "chtls.h" #include "chtls_cm.h" +#include "clip_tbl.h" /* * State transitions and actions for close.
Note that if we are in SYN_SENT @@ -82,15 +89,36 @@ static void chtls_sock_release(struct kref *ref) kfree(csk); } -static struct net_device *chtls_ipv4_netdev(struct chtls_dev *cdev, +static struct net_device *chtls_find_netdev(struct chtls_dev *cdev, struct sock *sk) { struct net_device *ndev = cdev->ports[0]; + struct net_device *temp; + int addr_type; + + switch (sk->sk_family) { + case PF_INET: + if (likely(!inet_sk(sk)->inet_rcv_saddr)) + return ndev; + ndev = ip_dev_find(&init_net, inet_sk(sk)->inet_rcv_saddr); + break; + case PF_INET6: + addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); + if (likely(addr_type == IPV6_ADDR_ANY)) + return ndev; + + for_each_netdev_rcu(&init_net, temp) { + if (ipv6_chk_addr(&init_net, (struct in6_addr *) + &sk->sk_v6_rcv_saddr, temp, 1)) { + ndev = temp; + break; + } + } + break; + default: + return NULL; + } - if (likely(!inet_sk(sk)->inet_rcv_saddr)) - return ndev; - - ndev = ip_dev_find(&init_net, inet_sk(sk)->inet_rcv_saddr); if (!ndev) return NULL; @@ -446,7 +474,10 @@ void chtls_destroy_sock(struct sock *sk) free_tls_keyid(sk); kref_put(&csk->kref, chtls_sock_release); csk->cdev = NULL; - sk->sk_prot = &tcp_prot; + if (sk->sk_family == AF_INET) + sk->sk_prot = &tcp_prot; + else + sk->sk_prot = &tcpv6_prot; sk->sk_prot->destroy(sk); } @@ -473,7 +504,8 @@ static void chtls_disconnect_acceptq(struct sock *listen_sk) while (*pprev) { struct request_sock *req = *pprev; - if (req->rsk_ops == &chtls_rsk_ops) { + if (req->rsk_ops == &chtls_rsk_ops || + req->rsk_ops == &chtls_rsk_opsv6) { struct sock *child = req->sk; *pprev = req->dl_next; @@ -600,14 +632,13 @@ int chtls_listen_start(struct chtls_dev *cdev, struct sock *sk) struct listen_ctx *ctx; struct adapter *adap; struct port_info *pi; + bool clip_valid; int stid; int ret; - if (sk->sk_family != PF_INET) - return -EAGAIN; - + clip_valid = false; rcu_read_lock(); - ndev = chtls_ipv4_netdev(cdev, sk); + ndev = chtls_find_netdev(cdev, sk); rcu_read_unlock(); if (!ndev) return -EBADF; @@ -638,16 +669,35 @@ int chtls_listen_start(struct chtls_dev *cdev, struct sock *sk) if (!listen_hash_add(cdev, sk, stid)) goto free_stid; - ret = cxgb4_create_server(ndev, stid, - inet_sk(sk)->inet_rcv_saddr, - inet_sk(sk)->inet_sport, 0, - cdev->lldi->rxq_ids[0]); + if (sk->sk_family == PF_INET) { + ret = cxgb4_create_server(ndev, stid, + inet_sk(sk)->inet_rcv_saddr, + inet_sk(sk)->inet_sport, 0, + cdev->lldi->rxq_ids[0]); + } else { + int addr_type; + + addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); + if (addr_type != IPV6_ADDR_ANY) { + ret = cxgb4_clip_get(ndev, (const u32 *) + &sk->sk_v6_rcv_saddr, 1); + if (ret) + goto del_hash; + clip_valid = true; + } + ret = cxgb4_create_server6(ndev, stid, + &sk->sk_v6_rcv_saddr, + inet_sk(sk)->inet_sport, + cdev->lldi->rxq_ids[0]); + } if (ret > 0) ret = net_xmit_errno(ret); if (ret) goto del_hash; return 0; del_hash: + if (clip_valid) + cxgb4_clip_release(ndev, (const u32 *)&sk->sk_v6_rcv_saddr, 1); listen_hash_del(cdev, sk); free_stid: cxgb4_free_stid(cdev->tids, stid, sk->sk_family); @@ -661,6 +711,8 @@ free_ctx: void chtls_listen_stop(struct chtls_dev *cdev, struct sock *sk) { struct listen_ctx *listen_ctx; + struct chtls_sock *csk; + int addr_type = 0; int stid; stid = listen_hash_del(cdev, sk); @@ -671,7 +723,16 @@ void chtls_listen_stop(struct chtls_dev *cdev, struct sock *sk) chtls_reset_synq(listen_ctx); cxgb4_remove_server(cdev->lldi->ports[0], stid, - cdev->lldi->rxq_ids[0], 0); + cdev->lldi->rxq_ids[0], sk->sk_family == PF_INET6); + + if (sk->sk_family == 
PF_INET6) { + csk = rcu_dereference_sk_user_data(sk); + addr_type = ipv6_addr_type((const struct in6_addr *) + &sk->sk_v6_rcv_saddr); + if (addr_type != IPV6_ADDR_ANY) + cxgb4_clip_release(csk->egress_dev, (const u32 *) + &sk->sk_v6_rcv_saddr, 1); + } chtls_disconnect_acceptq(sk); } @@ -880,7 +941,10 @@ static unsigned int chtls_select_mss(const struct chtls_sock *csk, tp = tcp_sk(sk); tcpoptsz = 0; - iphdrsz = sizeof(struct iphdr) + sizeof(struct tcphdr); + if (sk->sk_family == AF_INET6) + iphdrsz = sizeof(struct ipv6hdr) + sizeof(struct tcphdr); + else + iphdrsz = sizeof(struct iphdr) + sizeof(struct tcphdr); if (req->tcpopt.tstamp) tcpoptsz += round_up(TCPOLEN_TIMESTAMP, 4); @@ -1045,11 +1109,29 @@ static struct sock *chtls_recv_sock(struct sock *lsk, if (!newsk) goto free_oreq; - dst = inet_csk_route_child_sock(lsk, newsk, oreq); - if (!dst) - goto free_sk; + if (lsk->sk_family == AF_INET) { + dst = inet_csk_route_child_sock(lsk, newsk, oreq); + if (!dst) + goto free_sk; - n = dst_neigh_lookup(dst, &iph->saddr); + n = dst_neigh_lookup(dst, &iph->saddr); + } else { + const struct ipv6hdr *ip6h; + struct flowi6 fl6; + + ip6h = (const struct ipv6hdr *)network_hdr; + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + fl6.saddr = ip6h->daddr; + fl6.daddr = ip6h->saddr; + fl6.fl6_dport = inet_rsk(oreq)->ir_rmt_port; + fl6.fl6_sport = htons(inet_rsk(oreq)->ir_num); + security_req_classify_flow(oreq, flowi6_to_flowi(&fl6)); + dst = ip6_dst_lookup_flow(sock_net(lsk), lsk, &fl6, NULL); + if (IS_ERR(dst)) + goto free_sk; + n = dst_neigh_lookup(dst, &ip6h->saddr); + } if (!n) goto free_sk; @@ -1072,9 +1154,28 @@ static struct sock *chtls_recv_sock(struct sock *lsk, tp = tcp_sk(newsk); newinet = inet_sk(newsk); - newinet->inet_daddr = iph->saddr; - newinet->inet_rcv_saddr = iph->daddr; - newinet->inet_saddr = iph->daddr; + if (iph->version == 0x4) { + newinet->inet_daddr = iph->saddr; + newinet->inet_rcv_saddr = iph->daddr; + newinet->inet_saddr = iph->daddr; + } else { + struct tcp6_sock *newtcp6sk = (struct tcp6_sock *)newsk; + struct inet_request_sock *treq = inet_rsk(oreq); + struct ipv6_pinfo *newnp = inet6_sk(newsk); + struct ipv6_pinfo *np = inet6_sk(lsk); + + inet_sk(newsk)->pinet6 = &newtcp6sk->inet6; + memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + newsk->sk_v6_daddr = treq->ir_v6_rmt_addr; + newsk->sk_v6_rcv_saddr = treq->ir_v6_loc_addr; + inet6_sk(newsk)->saddr = treq->ir_v6_loc_addr; + newnp->ipv6_fl_list = NULL; + newnp->pktoptions = NULL; + newsk->sk_bound_dev_if = treq->ir_iif; + newinet->inet_opt = NULL; + newinet->inet_daddr = LOOPBACK4_IPV6; + newinet->inet_saddr = LOOPBACK4_IPV6; + } oreq->ts_recent = PASS_OPEN_TID_G(ntohl(req->tos_stid)); sk_setup_caps(newsk, dst); @@ -1156,6 +1257,7 @@ static void chtls_pass_accept_request(struct sock *sk, struct sk_buff *reply_skb; struct chtls_sock *csk; struct chtls_dev *cdev; + struct ipv6hdr *ip6h; struct tcphdr *tcph; struct sock *newsk; struct ethhdr *eh; @@ -1196,37 +1298,50 @@ static void chtls_pass_accept_request(struct sock *sk, if (sk_acceptq_is_full(sk)) goto reject; - oreq = inet_reqsk_alloc(&chtls_rsk_ops, sk, true); - if (!oreq) - goto reject; - - oreq->rsk_rcv_wnd = 0; - oreq->rsk_window_clamp = 0; - oreq->cookie_ts = 0; - oreq->mss = 0; - oreq->ts_recent = 0; eth_hdr_len = T6_ETH_HDR_LEN_G(ntohl(req->hdr_len)); if (eth_hdr_len == ETH_HLEN) { eh = (struct ethhdr *)(req + 1); iph = (struct iphdr *)(eh + 1); + ip6h = (struct ipv6hdr *)(eh + 1); network_hdr = (void *)(eh + 1); } else { vlan_eh = (struct 
vlan_ethhdr *)(req + 1); iph = (struct iphdr *)(vlan_eh + 1); + ip6h = (struct ipv6hdr *)(vlan_eh + 1); network_hdr = (void *)(vlan_eh + 1); } - if (iph->version != 0x4) - goto free_oreq; - tcph = (struct tcphdr *)(iph + 1); - skb_set_network_header(skb, (void *)iph - (void *)req); + if (iph->version == 0x4) { + tcph = (struct tcphdr *)(iph + 1); + skb_set_network_header(skb, (void *)iph - (void *)req); + oreq = inet_reqsk_alloc(&chtls_rsk_ops, sk, true); + } else { + tcph = (struct tcphdr *)(ip6h + 1); + skb_set_network_header(skb, (void *)ip6h - (void *)req); + oreq = inet_reqsk_alloc(&chtls_rsk_opsv6, sk, false); + } + + if (!oreq) + goto reject; + + oreq->rsk_rcv_wnd = 0; + oreq->rsk_window_clamp = 0; + oreq->cookie_ts = 0; + oreq->mss = 0; + oreq->ts_recent = 0; tcp_rsk(oreq)->tfo_listener = false; tcp_rsk(oreq)->rcv_isn = ntohl(tcph->seq); chtls_set_req_port(oreq, tcph->source, tcph->dest); - chtls_set_req_addr(oreq, iph->daddr, iph->saddr); - ip_dsfield = ipv4_get_dsfield(iph); + if (iph->version == 0x4) { + chtls_set_req_addr(oreq, iph->daddr, iph->saddr); + ip_dsfield = ipv4_get_dsfield(iph); + } else { + inet_rsk(oreq)->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; + inet_rsk(oreq)->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; + ip_dsfield = ipv6_get_dsfield(ipv6_hdr(skb)); + } if (req->tcpopt.wsf <= 14 && sock_net(sk)->ipv4.sysctl_tcp_window_scaling) { inet_rsk(oreq)->wscale_ok = 1; @@ -1243,7 +1358,7 @@ static void chtls_pass_accept_request(struct sock *sk, newsk = chtls_recv_sock(sk, oreq, network_hdr, req, cdev); if (!newsk) - goto reject; + goto free_oreq; if (chtls_get_module(newsk)) goto reject; diff --git a/drivers/crypto/chelsio/chtls/chtls_cm.h b/drivers/crypto/chelsio/chtls/chtls_cm.h index 3fac0c74a41f..47ba81e42f5d 100644 --- a/drivers/crypto/chelsio/chtls/chtls_cm.h +++ b/drivers/crypto/chelsio/chtls/chtls_cm.h @@ -79,6 +79,7 @@ enum { typedef void (*defer_handler_t)(struct chtls_dev *dev, struct sk_buff *skb); extern struct request_sock_ops chtls_rsk_ops; +extern struct request_sock_ops chtls_rsk_opsv6; struct deferred_skb_cb { defer_handler_t handler; diff --git a/drivers/crypto/chelsio/chtls/chtls_main.c b/drivers/crypto/chelsio/chtls/chtls_main.c index 2110d0893bc7..7dfffdde9593 100644 --- a/drivers/crypto/chelsio/chtls/chtls_main.c +++ b/drivers/crypto/chelsio/chtls/chtls_main.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include @@ -30,8 +32,8 @@ static DEFINE_MUTEX(cdev_mutex); static DEFINE_MUTEX(notify_mutex); static RAW_NOTIFIER_HEAD(listen_notify_list); -static struct proto chtls_cpl_prot; -struct request_sock_ops chtls_rsk_ops; +static struct proto chtls_cpl_prot, chtls_cpl_protv6; +struct request_sock_ops chtls_rsk_ops, chtls_rsk_opsv6; static uint send_page_order = (14 - PAGE_SHIFT < 0) ? 
0 : 14 - PAGE_SHIFT; static void register_listen_notifier(struct notifier_block *nb) @@ -586,7 +588,10 @@ static struct cxgb4_uld_info chtls_uld_info = { void chtls_install_cpl_ops(struct sock *sk) { - sk->sk_prot = &chtls_cpl_prot; + if (sk->sk_family == AF_INET) + sk->sk_prot = &chtls_cpl_prot; + else + sk->sk_prot = &chtls_cpl_protv6; } static void __init chtls_init_ulp_ops(void) @@ -603,6 +608,9 @@ static void __init chtls_init_ulp_ops(void) chtls_cpl_prot.recvmsg = chtls_recvmsg; chtls_cpl_prot.setsockopt = chtls_setsockopt; chtls_cpl_prot.getsockopt = chtls_getsockopt; + chtls_cpl_protv6 = chtls_cpl_prot; + chtls_init_rsk_ops(&chtls_cpl_protv6, &chtls_rsk_opsv6, + &tcpv6_prot, PF_INET6); } static int __init chtls_register(void) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b7415ca75c2d..f67d45ff00b4 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2121,6 +2121,7 @@ struct proto tcpv6_prot = { #endif .diag_destroy = tcp_abort, }; +EXPORT_SYMBOL_GPL(tcpv6_prot); /* thinking of making this const? Don't. * early_demux can change based on sysctl. -- cgit v1.2.3 From 836e66c218f355ec01ba57671c85abf32961dcea Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Jun 2020 16:58:32 +0200 Subject: bpf: Fix up bpf_skb_adjust_room helper's skb csum setting Lorenz recently reported:

  In our TC classifier cls_redirect [0], we use the following sequence of helper calls to decapsulate a GUE (basically IP + UDP + custom header) encapsulated packet:

      bpf_skb_adjust_room(skb, -encap_len, BPF_ADJ_ROOM_MAC, BPF_F_ADJ_ROOM_FIXED_GSO)
      bpf_redirect(skb->ifindex, BPF_F_INGRESS)

  It seems like some checksums of the inner headers are not validated in this case. For example, a TCP SYN packet with an invalid TCP checksum is still accepted by the network stack and elicits a SYN ACK. [...] That is, we receive the following packet from the driver:

      | ETH | IP | UDP | GUE | IP | TCP |
      skb->ip_summed == CHECKSUM_UNNECESSARY

  ip_summed is CHECKSUM_UNNECESSARY because our NICs do rx checksum offloading. On this packet we run skb_adjust_room_mac(-encap_len), and get the following:

      | ETH | IP | TCP |
      skb->ip_summed == CHECKSUM_UNNECESSARY

  Note that ip_summed is still CHECKSUM_UNNECESSARY. After bpf_redirect()'ing into the ingress, we end up in tcp_v4_rcv(). There, skb_checksum_init() is turned into a no-op due to CHECKSUM_UNNECESSARY.

The bpf_skb_adjust_room() helper is not aware of protocol specifics. Internally, it handles the CHECKSUM_COMPLETE case via skb_postpull_rcsum(), but that does not cover CHECKSUM_UNNECESSARY. In this case the skb->csum_level of the original skb prior to the bpf_skb_adjust_room() call was 0, that is, covering UDP. Right now there is no way to adjust the skb->csum_level. NICs that have checksum offload disabled (CHECKSUM_NONE) or that support CHECKSUM_COMPLETE are not affected. Use a safe default for CHECKSUM_UNNECESSARY by resetting to CHECKSUM_NONE, and add a flag to the helper called BPF_F_ADJ_ROOM_NO_CSUM_RESET that allows users to opt out. Opting out is useful for the case where we don't remove/add full protocol headers, or for the case where a user wants to adjust the csum level manually, e.g. through the bpf_csum_level() helper that is added in a subsequent patch. The bpf_skb_proto_{4_to_6,6_to_4}() for NAT64/46 translation from the BPF bpf_skb_change_proto() helper uses the bpf_skb_net_hdr_{push,pop}() pair internally as well but doesn't change layers, only transitions from v4 to v6 and vice versa, therefore no adaptation is required there.
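As an illustration of the resulting helper contract, a minimal tc decap sketch (hypothetical program, not taken from this patch; ENCAP_LEN stands in for the outer header length being removed):

    #include <linux/bpf.h>
    #include <linux/pkt_cls.h>
    #include <bpf/bpf_helpers.h>

    #define ENCAP_LEN 32 /* assumed outer IPv4(20) + UDP(8) + GUE(4) */

    SEC("classifier")
    int decap_gue(struct __sk_buff *skb)
    {
            /* Shrink at the MAC layer to drop the outer headers. With
             * this patch the helper also resets CHECKSUM_UNNECESSARY to
             * CHECKSUM_NONE, so the stack re-validates inner checksums.
             */
            if (bpf_skb_adjust_room(skb, -ENCAP_LEN, BPF_ADJ_ROOM_MAC,
                                    BPF_F_ADJ_ROOM_FIXED_GSO))
                    return TC_ACT_SHOT;

            return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
    }

    char _license[] SEC("license") = "GPL";

Passing BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_NO_CSUM_RESET instead keeps CHECKSUM_UNNECESSARY and leaves fixing up skb->csum_level to the program.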
[0] https://lore.kernel.org/bpf/20200424185556.7358-1-lmb@cloudflare.com/ Fixes: 2be7e212d541 ("bpf: add bpf_skb_adjust_room helper") Reported-by: Lorenz Bauer Reported-by: Alan Maguire Signed-off-by: Daniel Borkmann Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Reviewed-by: Alan Maguire Link: https://lore.kernel.org/bpf/CACAyw9-uU_52esMd1JjuA80fRPHJv5vsSg8GnfW3t_qDU4aVKQ@mail.gmail.com/ Link: https://lore.kernel.org/bpf/11a90472e7cce83e76ddbfce81fdfce7bfc68808.1591108731.git.daniel@iogearbox.net --- include/linux/skbuff.h | 8 ++++++++ include/uapi/linux/bpf.h | 8 ++++++++ net/core/filter.c | 8 ++++++-- tools/include/uapi/linux/bpf.h | 8 ++++++++ 4 files changed, 30 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a0d5c2760103..0c0377fc00c2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3919,6 +3919,14 @@ static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb) } } +static inline void __skb_reset_checksum_unnecessary(struct sk_buff *skb) +{ + if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + skb->ip_summed = CHECKSUM_NONE; + skb->csum_level = 0; + } +} + /* Check if we need to perform checksum complete validation. * * Returns true if checksum complete is needed, false otherwise diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b9ed9f14f2a2..3ba2bbbed80c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1635,6 +1635,13 @@ union bpf_attr { * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. * + * By default, the helper will reset any offloaded checksum + * indicator of the skb to CHECKSUM_NONE. This can be avoided + * by the following flag: + * + * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded + * checksum data of the skb to CHECKSUM_NONE. + * * There are two supported modes at this time: * * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer @@ -3433,6 +3440,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), + BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), }; enum { diff --git a/net/core/filter.c b/net/core/filter.c index ae82bcb03124..278dcc0af961 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3113,7 +3113,8 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, { int ret; - if (flags & ~BPF_F_ADJ_ROOM_FIXED_GSO) + if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | + BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { @@ -3163,7 +3164,8 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32 off; int ret; - if (unlikely(flags & ~BPF_F_ADJ_ROOM_MASK)) + if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK | + BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; @@ -3191,6 +3193,8 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, ret = shrink ? 
bpf_skb_net_shrink(skb, off, len_diff_abs, flags) : bpf_skb_net_grow(skb, off, len_diff_abs, flags); + if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET)) + __skb_reset_checksum_unnecessary(skb); bpf_compute_data_pointers(skb); return ret; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b9ed9f14f2a2..3ba2bbbed80c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1635,6 +1635,13 @@ union bpf_attr { * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. * + * By default, the helper will reset any offloaded checksum + * indicator of the skb to CHECKSUM_NONE. This can be avoided + * by the following flag: + * + * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded + * checksum data of the skb to CHECKSUM_NONE. + * * There are two supported modes at this time: * * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer @@ -3433,6 +3440,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), + BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), }; enum { -- cgit v1.2.3 From 7cdec54f9713256bb170873a1fc5c75c9127c9d2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Jun 2020 16:58:33 +0200 Subject: bpf: Add csum_level helper for fixing up csum levels Add a bpf_csum_level() helper which BPF programs can use in combination with bpf_skb_adjust_room() when they pass the BPF_F_ADJ_ROOM_NO_CSUM_RESET flag to the latter to avoid falling back to CHECKSUM_NONE. bpf_csum_level() allows adjusting the CHECKSUM_UNNECESSARY skb->csum_level via BPF_CSUM_LEVEL_{INC,DEC}, which call __skb_{incr,decr}_checksum_unnecessary() on the skb. The helper also offers BPF_CSUM_LEVEL_RESET, which sets the skb's csum to CHECKSUM_NONE, as well as BPF_CSUM_LEVEL_QUERY to just return the current level. Without this helper, there is otherwise no way to adjust the skb->csum_level. I did not add an extra dummy flags argument, as there is plenty of free bit space in the level argument itself if it is ever needed in the future. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Reviewed-by: Alan Maguire Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/279ae3717cb3d03c0ffeb511493c93c450a01e1a.1591108731.git.daniel@iogearbox.net --- include/uapi/linux/bpf.h | 43 +++++++++++++++++++++++++++++++++++++++++- net/core/filter.c | 38 +++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 43 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 122 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3ba2bbbed80c..c65b374a5090 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3220,6 +3220,38 @@ union bpf_attr { * calculation. * Return * Requested value, or 0, if flags are not recognized. + * + * int bpf_csum_level(struct sk_buff *skb, u64 level) + * Description + * Change the skbs checksum level by one layer up or down, or + * reset it entirely to none in order to have the stack perform + * checksum validation. The level is applicable to the following + * protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of + * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP | + * through **bpf_skb_adjust_room**\ () helper with passing in + * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call + * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since + * the UDP header is removed.
Similarly, an encap of the latter + * into the former could be accompanied by a helper call to + * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the + * skb is still intended to be processed in higher layers of the + * stack instead of just egressing at tc. + * + * There are three supported level settings at this time: + * + * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and + * sets CHECKSUM_NONE to force checksum validation by the stack. + * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current + * skb->csum_level. + * Return + * 0 on success, or a negative error in case of failure. In the + * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level + * is returned or the error code -EACCES in case the skb is not + * subject to CHECKSUM_UNNECESSARY. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3356,7 +3388,8 @@ union bpf_attr { FN(ringbuf_reserve), \ FN(ringbuf_submit), \ FN(ringbuf_discard), \ - FN(ringbuf_query), + FN(ringbuf_query), \ + FN(csum_level), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3433,6 +3466,14 @@ enum { BPF_F_CURRENT_NETNS = (-1L), }; +/* BPF_FUNC_csum_level level values. */ +enum { + BPF_CSUM_LEVEL_QUERY, + BPF_CSUM_LEVEL_INC, + BPF_CSUM_LEVEL_DEC, + BPF_CSUM_LEVEL_RESET, +}; + /* BPF_FUNC_skb_adjust_room flags. */ enum { BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), diff --git a/net/core/filter.c b/net/core/filter.c index 278dcc0af961..d01a244b5087 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2015,6 +2015,40 @@ static const struct bpf_func_proto bpf_csum_update_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level) +{ + /* The interface is to be used in combination with bpf_skb_adjust_room() + * for encap/decap of packet headers when BPF_F_ADJ_ROOM_NO_CSUM_RESET + * is passed as flags, for example. + */ + switch (level) { + case BPF_CSUM_LEVEL_INC: + __skb_incr_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_DEC: + __skb_decr_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_RESET: + __skb_reset_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_QUERY: + return skb->ip_summed == CHECKSUM_UNNECESSARY ? 
+ skb->csum_level : -EACCES; + default: + return -EINVAL; + } + + return 0; +} + +static const struct bpf_func_proto bpf_csum_level_proto = { + .func = bpf_csum_level, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) { return dev_forward_skb(dev, skb); @@ -6280,6 +6314,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_csum_diff_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; + case BPF_FUNC_csum_level: + return &bpf_csum_level_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: @@ -6613,6 +6649,8 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_store_bytes_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; + case BPF_FUNC_csum_level: + return &bpf_csum_level_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3ba2bbbed80c..c65b374a5090 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3220,6 +3220,38 @@ union bpf_attr { * calculation. * Return * Requested value, or 0, if flags are not recognized. + * + * int bpf_csum_level(struct sk_buff *skb, u64 level) + * Description + * Change the skbs checksum level by one layer up or down, or + * reset it entirely to none in order to have the stack perform + * checksum validation. The level is applicable to the following + * protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of + * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP | + * through **bpf_skb_adjust_room**\ () helper with passing in + * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call + * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since + * the UDP header is removed. Similarly, an encap of the latter + * into the former could be accompanied by a helper call to + * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the + * skb is still intended to be processed in higher layers of the + * stack instead of just egressing at tc. + * + * There are three supported level settings at this time: + * + * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and + * sets CHECKSUM_NONE to force checksum validation by the stack. + * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current + * skb->csum_level. + * Return + * 0 on success, or a negative error in case of failure. In the + * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level + * is returned or the error code -EACCES in case the skb is not + * subject to CHECKSUM_UNNECESSARY. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3356,7 +3388,8 @@ union bpf_attr { FN(ringbuf_reserve), \ FN(ringbuf_submit), \ FN(ringbuf_discard), \ - FN(ringbuf_query), + FN(ringbuf_query), \ + FN(csum_level), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3433,6 +3466,14 @@ enum { BPF_F_CURRENT_NETNS = (-1L), }; +/* BPF_FUNC_csum_level level values. 
*/ +enum { + BPF_CSUM_LEVEL_QUERY, + BPF_CSUM_LEVEL_INC, + BPF_CSUM_LEVEL_DEC, + BPF_CSUM_LEVEL_RESET, +}; + /* BPF_FUNC_skb_adjust_room flags. */ enum { BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), -- cgit v1.2.3 From 049fa17f7ae6b0971ad41b761479962facafea4b Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Tue, 2 Jun 2020 11:46:40 +0700 Subject: Revert "tipc: Fix potential tipc_node refcnt leak in tipc_rcv" This reverts commit de058420767df21e2b6b0f3bb36d1616fb962032. There is no actual tipc_node refcnt leak as stated in the above commit. The refcnt is held carefully for the case of an asynchronous decryption (i.e. -EINPROGRESS/-EBUSY is returned and skb is set to NULL), so that the node object cannot be freed in the meantime. The counter will be re-balanced when the operation's callback arrives with the decrypted buffer, if any. In other cases, e.g. a synchronous crypto operation, the counter is decreased immediately when the operation is done. With that commit, a kernel panic will occur in tipc_rcv() when no node is found (i.e. n = NULL), or the node object may be released prematurely. This commit solves the issues by reverting the said commit, while keeping the one valid case where skb_linearize() fails. Acked-by: Jon Maloy Signed-off-by: Tuong Lien Tested-by: Hoang Le Signed-off-by: David S. Miller --- net/tipc/node.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/tipc/node.c b/net/tipc/node.c index 0312fb181d94..a4c2816c3746 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -2038,7 +2038,6 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) n = tipc_node_find_by_id(net, ehdr->id); } tipc_crypto_rcv(net, (n) ? n->crypto_rx : NULL, &skb, b); - tipc_node_put(n); if (!skb) return; -- cgit v1.2.3 From a275727b1899d14d33d6c8c70f303b73f01cdbc6 Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Tue, 2 Jun 2020 11:46:41 +0700 Subject: Revert "tipc: Fix potential tipc_aead refcnt leak in tipc_crypto_rcv" This reverts commit 441870ee4240cf67b5d3ab8e16216a9ff42eb5d6. Like the previous patch in this series, we revert the above commit, which causes similar issues with the 'aead' object. Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- net/tipc/crypto.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index 8c47ded2edb6..c8c47fc72653 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -1712,7 +1712,6 @@ exit: case -EBUSY: this_cpu_inc(stats->stat[STAT_ASYNC]); *skb = NULL; - tipc_aead_put(aead); return rc; default: this_cpu_inc(stats->stat[STAT_NOK]); -- cgit v1.2.3
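To make the ownership rule behind these two reverts concrete, a rough sketch (illustrative only, not the actual tipc code):

    /* Receive side, as in tipc_rcv(): the lookup takes a node ref. */
    n = tipc_node_find_by_id(net, ehdr->id);
    tipc_crypto_rcv(net, n ? n->crypto_rx : NULL, &skb, b);
    /* From here tipc_crypto_rcv() owns that ref: a synchronous result
     * drops it internally when done, while -EINPROGRESS/-EBUSY (skb set
     * to NULL) keeps it alive until the decryption callback re-balances
     * it. An extra tipc_node_put(n) here would therefore be a premature
     * release, and a NULL node would make it crash outright.
     */
    if (!skb)
            return;

The reverted commits added exactly such extra put calls, on the caller side for the node and on the -EINPROGRESS/-EBUSY path for the aead, which is why they are backed out here.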