From 51d62f2f2c501a93d9a6a46f43731f984e227764 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Tue, 5 Jan 2021 16:56:57 +0200 Subject: cfg80211: Save the regulatory domain with a lock Saving the regulatory domain while setting custom regulatory domain was done while accessing a RCU protected pointer but without any protection. Fix this by using RTNL while accessing the pointer. Signed-off-by: Ilan Peer Reported-by: syzbot+27771d4abcd9b7a1f5d3@syzkaller.appspotmail.com Reported-by: syzbot+db4035751c56c0079282@syzkaller.appspotmail.com Reported-by: Hans de Goede Fixes: beee24695157 ("cfg80211: Save the regulatory domain when setting custom regulatory") Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210105165657.613e9a876829.Ia38d27dbebea28bf9c56d70691d243186ede70e7@changeid Signed-off-by: Johannes Berg --- net/wireless/reg.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index bb72447ad960..8114bba8556c 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -5,7 +5,7 @@ * Copyright 2008-2011 Luis R. Rodriguez * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2021 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -139,6 +139,11 @@ static const struct ieee80211_regdomain *get_cfg80211_regdom(void) return rcu_dereference_rtnl(cfg80211_regdomain); } +/* + * Returns the regulatory domain associated with the wiphy. + * + * Requires either RTNL or RCU protection + */ const struct ieee80211_regdomain *get_wiphy_regdom(struct wiphy *wiphy) { return rcu_dereference_rtnl(wiphy->regd); @@ -2571,9 +2576,13 @@ void wiphy_apply_custom_regulatory(struct wiphy *wiphy, if (IS_ERR(new_regd)) return; + rtnl_lock(); + tmp = get_wiphy_regdom(wiphy); rcu_assign_pointer(wiphy->regd, new_regd); rcu_free_regdom(tmp); + + rtnl_unlock(); } EXPORT_SYMBOL(wiphy_apply_custom_regulatory); -- cgit v1.2.3 From 5f39d2713bd80e8a3e6d9299930aec8844872c0e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 3 Jan 2021 14:39:27 -0500 Subject: SUNRPC: Move the svc_xdr_recvfrom tracepoint again Commit 156708adf2d9 ("SUNRPC: Move the svc_xdr_recvfrom() tracepoint") tried to capture the correct XID in the trace record, but this line in svc_recv: rqstp->rq_xid = svc_getu32(&rqstp->rq_arg.head[0]); alters the size of rq_arg.head[0].iov_len. The tracepoint records the correct XID but an incorrect value for the length of the xdr_buf's head. To keep the trace callsites simple, I've created two trace classes. One assumes the xdr_buf contains a full RPC message, and the XID can be extracted from it. The other assumes the contents of the xdr_buf are arbitrary, and the xid will be provided by the caller. Currently there is only one user of each class, but I expect we will need a few more tracepoints using each class as time goes on. 
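The new msg class can recover the XID on its own because a well-formed RPC message begins with the XID as a 32-bit big-endian word at the start of head[0]; the buf class makes no such assumption, which is why its callers now pass the XID explicitly. A minimal userspace sketch of that extraction (the byte values are made up for illustration):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* first bytes of an RPC message: the XID in network byte order */
	unsigned char head[8] = { 0x12, 0x34, 0x56, 0x78, 0, 0, 0, 0 };
	uint32_t xid_be;

	memcpy(&xid_be, head, sizeof(xid_be));
	printf("xid=0x%08x\n", ntohl(xid_be));	/* xid=0x12345678 */
	return 0;
}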
Signed-off-by: Chuck Lever --- include/trace/events/sunrpc.h | 59 ++++++++++++++++++++++++++++++++++++++----- net/sunrpc/svc_xprt.c | 4 +-- 2 files changed, 55 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 58994e013022..6f89c27265f5 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1424,13 +1424,61 @@ TRACE_EVENT(rpcb_unregister, ) ); +/* Record an xdr_buf containing a fully-formed RPC message */ +DECLARE_EVENT_CLASS(svc_xdr_msg_class, + TP_PROTO( + const struct xdr_buf *xdr + ), + + TP_ARGS(xdr), + + TP_STRUCT__entry( + __field(u32, xid) + __field(const void *, head_base) + __field(size_t, head_len) + __field(const void *, tail_base) + __field(size_t, tail_len) + __field(unsigned int, page_len) + __field(unsigned int, msg_len) + ), + + TP_fast_assign( + __be32 *p = (__be32 *)xdr->head[0].iov_base; + + __entry->xid = be32_to_cpu(*p); + __entry->head_base = p; + __entry->head_len = xdr->head[0].iov_len; + __entry->tail_base = xdr->tail[0].iov_base; + __entry->tail_len = xdr->tail[0].iov_len; + __entry->page_len = xdr->page_len; + __entry->msg_len = xdr->len; + ), + + TP_printk("xid=0x%08x head=[%p,%zu] page=%u tail=[%p,%zu] len=%u", + __entry->xid, + __entry->head_base, __entry->head_len, __entry->page_len, + __entry->tail_base, __entry->tail_len, __entry->msg_len + ) +); + +#define DEFINE_SVCXDRMSG_EVENT(name) \ + DEFINE_EVENT(svc_xdr_msg_class, \ + svc_xdr_##name, \ + TP_PROTO( \ + const struct xdr_buf *xdr \ + ), \ + TP_ARGS(xdr)) + +DEFINE_SVCXDRMSG_EVENT(recvfrom); + +/* Record an xdr_buf containing arbitrary data, tagged with an XID */ DECLARE_EVENT_CLASS(svc_xdr_buf_class, TP_PROTO( - const struct svc_rqst *rqst, + __be32 xid, const struct xdr_buf *xdr ), - TP_ARGS(rqst, xdr), + TP_ARGS(xid, xdr), TP_STRUCT__entry( __field(u32, xid) @@ -1443,7 +1491,7 @@ DECLARE_EVENT_CLASS(svc_xdr_buf_class, ), TP_fast_assign( - __entry->xid = be32_to_cpu(rqst->rq_xid); + __entry->xid = be32_to_cpu(xid); __entry->head_base = xdr->head[0].iov_base; __entry->head_len = xdr->head[0].iov_len; __entry->tail_base = xdr->tail[0].iov_base; @@ -1463,12 +1511,11 @@ DECLARE_EVENT_CLASS(svc_xdr_buf_class, DEFINE_EVENT(svc_xdr_buf_class, \ svc_xdr_##name, \ TP_PROTO( \ - const struct svc_rqst *rqst, \ + __be32 xid, \ const struct xdr_buf *xdr \ ), \ - TP_ARGS(rqst, xdr)) + TP_ARGS(xid, xdr)) -DEFINE_SVCXDRBUF_EVENT(recvfrom); DEFINE_SVCXDRBUF_EVENT(sendto); /* diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 5fb9164aa690..dcc50ae54550 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -857,6 +857,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) err = -EAGAIN; if (len <= 0) goto out_release; + trace_svc_xdr_recvfrom(&rqstp->rq_arg); clear_bit(XPT_OLD, &xprt->xpt_flags); @@ -866,7 +867,6 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) if (serv->sv_stats) serv->sv_stats->netcnt++; - trace_svc_xdr_recvfrom(rqstp, &rqstp->rq_arg); return len; out_release: rqstp->rq_res.len = 0; @@ -904,7 +904,7 @@ int svc_send(struct svc_rqst *rqstp) xb->len = xb->head[0].iov_len + xb->page_len + xb->tail[0].iov_len; - trace_svc_xdr_sendto(rqstp, xb); + trace_svc_xdr_sendto(rqstp->rq_xid, xb); trace_svc_stats_latency(rqstp); len = xprt->xpt_ops->xpo_sendto(rqstp); -- cgit v1.2.3 From 7ac6ad051150592557520b45773201b987ecfce3 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 12 Jan 2021 15:42:54 -0800 Subject: bpf: Reject too big ctx_size_in for raw_tp test run syzbot reported 
a WARNING for allocating too big memory: WARNING: CPU: 1 PID: 8484 at mm/page_alloc.c:4976 __alloc_pages_nodemask+0x5f8/0x730 mm/page_alloc.c:5011 Modules linked in: CPU: 1 PID: 8484 Comm: syz-executor862 Not tainted 5.11.0-rc2-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__alloc_pages_nodemask+0x5f8/0x730 mm/page_alloc.c:4976 Code: 00 00 0c 00 0f 85 a7 00 00 00 8b 3c 24 4c 89 f2 44 89 e6 c6 44 24 70 00 48 89 6c 24 58 e8 d0 d7 ff ff 49 89 c5 e9 ea fc ff ff <0f> 0b e9 b5 fd ff ff 89 74 24 14 4c 89 4c 24 08 4c 89 74 24 18 e8 RSP: 0018:ffffc900012efb10 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 1ffff9200025df66 RCX: 0000000000000000 RDX: 0000000000000000 RSI: dffffc0000000000 RDI: 0000000000140dc0 RBP: 0000000000140dc0 R08: 0000000000000000 R09: 0000000000000000 R10: ffffffff81b1f7e1 R11: 0000000000000000 R12: 0000000000000014 R13: 0000000000000014 R14: 0000000000000000 R15: 0000000000000000 FS: 000000000190c880(0000) GS:ffff8880b9e00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f08b7f316c0 CR3: 0000000012073000 CR4: 00000000001506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: alloc_pages_current+0x18c/0x2a0 mm/mempolicy.c:2267 alloc_pages include/linux/gfp.h:547 [inline] kmalloc_order+0x2e/0xb0 mm/slab_common.c:837 kmalloc_order_trace+0x14/0x120 mm/slab_common.c:853 kmalloc include/linux/slab.h:557 [inline] kzalloc include/linux/slab.h:682 [inline] bpf_prog_test_run_raw_tp+0x4b5/0x670 net/bpf/test_run.c:282 bpf_prog_test_run kernel/bpf/syscall.c:3120 [inline] __do_sys_bpf+0x1ea9/0x4f10 kernel/bpf/syscall.c:4398 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x440499 Code: 18 89 d0 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 7b 13 fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007ffe1f3bfb18 EFLAGS: 00000246 ORIG_RAX: 0000000000000141 RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 0000000000440499 RDX: 0000000000000048 RSI: 0000000020000600 RDI: 000000000000000a RBP: 00000000006ca018 R08: 0000000000000000 R09: 00000000004002c8 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000401ca0 R13: 0000000000401d30 R14: 0000000000000000 R15: 0000000000000000 This is because we didn't filter out too big ctx_size_in. Fix it by rejecting ctx_size_in that are bigger than MAX_BPF_FUNC_ARGS (12) u64 numbers. 
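For reference, the limit being enforced is MAX_BPF_FUNC_ARGS (12) u64s, i.e. 96 bytes. A standalone sketch of the same bounds check, not the kernel code itself (the function name and the max_ctx_offset value are illustrative):

#include <stdint.h>
#include <stdio.h>

#define MAX_BPF_FUNC_ARGS 12	/* as in include/linux/bpf.h */

static int raw_tp_ctx_size_ok(uint32_t ctx_size_in, uint32_t max_ctx_offset)
{
	if (ctx_size_in < max_ctx_offset)	/* too small: prog reads past it */
		return 0;
	if (ctx_size_in > MAX_BPF_FUNC_ARGS * sizeof(uint64_t))	/* too big */
		return 0;
	return 1;
}

int main(void)
{
	printf("upper bound = %zu bytes\n", MAX_BPF_FUNC_ARGS * sizeof(uint64_t));
	printf("4096-byte ctx accepted? %d\n", raw_tp_ctx_size_ok(4096, 8));	/* 0 */
	return 0;
}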
Fixes: 1b4d60ec162f ("bpf: Enable BPF_PROG_TEST_RUN for raw_tracepoint") Reported-by: syzbot+4f98876664c7337a4ae6@syzkaller.appspotmail.com Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210112234254.1906829-1-songliubraving@fb.com --- net/bpf/test_run.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index c1c30a9f76f3..8b796c499cbb 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -272,7 +272,8 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, kattr->test.repeat) return -EINVAL; - if (ctx_size_in < prog->aux->max_ctx_offset) + if (ctx_size_in < prog->aux->max_ctx_offset || + ctx_size_in > MAX_BPF_FUNC_ARGS * sizeof(u64)) return -EINVAL; if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 && cpu != 0) -- cgit v1.2.3 From 6020d534fa012b80c6d13811dc4d2dfedca2e403 Mon Sep 17 00:00:00 2001 From: Shayne Chen Date: Tue, 12 Jan 2021 11:20:28 +0800 Subject: mac80211: fix incorrect strlen of .write in debugfs This fixes strlen mismatch problems happening in some .write callbacks of debugfs. When trying to configure airtime_flags in debugfs, an error appeared: ash: write error: Invalid argument The error is returned from kstrtou16() since a wrong length makes it miss the real end of input string. To fix this, use count as the string length, and set proper end of string for a char buffer. The debug print is shown - airtime_flags_write: count = 2, len = 8, where the actual length is 2, but "len = strlen(buf)" gets 8. Also cleanup the other similar cases for the sake of consistency. Signed-off-by: Sujuan Chen Signed-off-by: Ryder Lee Signed-off-by: Shayne Chen Link: https://lore.kernel.org/r/20210112032028.7482-1-shayne.chen@mediatek.com Signed-off-by: Johannes Berg --- net/mac80211/debugfs.c | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 48f144f107d5..9e723d943421 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -120,18 +120,17 @@ static ssize_t aqm_write(struct file *file, { struct ieee80211_local *local = file->private_data; char buf[100]; - size_t len; - if (count > sizeof(buf)) + if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; - buf[sizeof(buf) - 1] = '\0'; - len = strlen(buf); - if (len > 0 && buf[len-1] == '\n') - buf[len-1] = 0; + if (count && buf[count - 1] == '\n') + buf[count - 1] = '\0'; + else + buf[count] = '\0'; if (sscanf(buf, "fq_limit %u", &local->fq.limit) == 1) return count; @@ -177,18 +176,17 @@ static ssize_t airtime_flags_write(struct file *file, { struct ieee80211_local *local = file->private_data; char buf[16]; - size_t len; - if (count > sizeof(buf)) + if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; - buf[sizeof(buf) - 1] = 0; - len = strlen(buf); - if (len > 0 && buf[len - 1] == '\n') - buf[len - 1] = 0; + if (count && buf[count - 1] == '\n') + buf[count - 1] = '\0'; + else + buf[count] = '\0'; if (kstrtou16(buf, 0, &local->airtime_flags)) return -EINVAL; @@ -237,20 +235,19 @@ static ssize_t aql_txq_limit_write(struct file *file, { struct ieee80211_local *local = file->private_data; char buf[100]; - size_t len; u32 ac, q_limit_low, q_limit_high, q_limit_low_old, q_limit_high_old; struct sta_info *sta; - if (count > sizeof(buf)) + if (count >= sizeof(buf)) 
return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; - buf[sizeof(buf) - 1] = 0; - len = strlen(buf); - if (len > 0 && buf[len - 1] == '\n') - buf[len - 1] = 0; + if (count && buf[count - 1] == '\n') + buf[count - 1] = '\0'; + else + buf[count] = '\0'; if (sscanf(buf, "%u %u %u", &ac, &q_limit_low, &q_limit_high) != 3) return -EINVAL; @@ -306,18 +303,17 @@ static ssize_t force_tx_status_write(struct file *file, { struct ieee80211_local *local = file->private_data; char buf[3]; - size_t len; - if (count > sizeof(buf)) + if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; - buf[sizeof(buf) - 1] = '\0'; - len = strlen(buf); - if (len > 0 && buf[len - 1] == '\n') - buf[len - 1] = 0; + if (count && buf[count - 1] == '\n') + buf[count - 1] = '\0'; + else + buf[count] = '\0'; if (buf[0] == '0' && buf[1] == '\0') local->force_tx_status = 0; -- cgit v1.2.3 From 622d3b4e39381262da7b18ca1ed1311df227de86 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 18 Dec 2020 19:47:17 +0100 Subject: mac80211: fix fast-rx encryption check When using WEP, the default unicast key needs to be selected, instead of the STA PTK. Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20201218184718.93650-5-nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 13b9bcc4865d..972895e9f22d 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4176,6 +4176,8 @@ void ieee80211_check_fast_rx(struct sta_info *sta) rcu_read_lock(); key = rcu_dereference(sta->ptk[sta->ptk_idx]); + if (!key) + key = rcu_dereference(sdata->default_unicast_key); if (key) { switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_TKIP: -- cgit v1.2.3 From b101dd2d22f45d203010b40c739df346a0cbebef Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 18 Dec 2020 19:47:16 +0100 Subject: mac80211: fix encryption key selection for 802.3 xmit When using WEP, the default unicast key needs to be selected, instead of the STA PTK. 
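The change below mirrors the fast-rx fix above: look up the pairwise key first and fall back to the default unicast key, which is the only key WEP has. A tiny standalone model of that lookup order (types and names are stand-ins, not mac80211's):

#include <stdio.h>

struct key { const char *name; };

static const struct key *select_tx_key(const struct key *ptk,
				       const struct key *default_unicast)
{
	return ptk ? ptk : default_unicast;	/* may still be NULL */
}

int main(void)
{
	struct key wep = { "default unicast key (WEP)" };

	/* WEP has no per-station PTK, so the fallback is what gets used */
	printf("%s\n", select_tx_key(NULL, &wep)->name);
	return 0;
}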
Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20201218184718.93650-4-nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 6422da6690f7..2d606dbab37c 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -4251,7 +4251,6 @@ netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb, struct ethhdr *ehdr = (struct ethhdr *)skb->data; struct ieee80211_key *key; struct sta_info *sta; - bool offload = true; if (unlikely(skb->len < ETH_HLEN)) { kfree_skb(skb); @@ -4267,18 +4266,22 @@ netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb, if (unlikely(IS_ERR_OR_NULL(sta) || !sta->uploaded || !test_sta_flag(sta, WLAN_STA_AUTHORIZED) || - sdata->control_port_protocol == ehdr->h_proto)) - offload = false; - else if ((key = rcu_dereference(sta->ptk[sta->ptk_idx])) && - (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) || - key->conf.cipher == WLAN_CIPHER_SUITE_TKIP)) - offload = false; - - if (offload) - ieee80211_8023_xmit(sdata, dev, sta, key, skb); - else - ieee80211_subif_start_xmit(skb, dev); + sdata->control_port_protocol == ehdr->h_proto)) + goto skip_offload; + + key = rcu_dereference(sta->ptk[sta->ptk_idx]); + if (!key) + key = rcu_dereference(sdata->default_unicast_key); + + if (key && (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) || + key->conf.cipher == WLAN_CIPHER_SUITE_TKIP)) + goto skip_offload; + + ieee80211_8023_xmit(sdata, dev, sta, key, skb); + goto out; +skip_offload: + ieee80211_subif_start_xmit(skb, dev); out: rcu_read_unlock(); -- cgit v1.2.3 From 2463ec86cd0338a2c2edbfb0b9d50c52ff76ff43 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 18 Dec 2020 20:15:25 +0100 Subject: mac80211: do not drop tx nulldata packets on encrypted links ieee80211_tx_h_select_key drops any non-mgmt packets without a key when encryption is used. This is wrong for nulldata packets that can't be encrypted and are sent out for probing clients and indicating 4-address mode. 
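The crux is which frames count as carrying data. A userspace restatement of the two frame-control tests involved, with 802.11 constants as I understand them from include/linux/ieee80211.h (host byte order for simplicity; treat the values as illustrative):

#include <stdint.h>
#include <stdio.h>

#define FCTL_FTYPE	0x000c
#define FTYPE_MGMT	0x0000
#define FTYPE_DATA	0x0008
#define STYPE_NULLFUNC	0x0040	/* "no data" bit in the data subtypes */

static int is_mgmt(uint16_t fc)
{
	return (fc & FCTL_FTYPE) == FTYPE_MGMT;
}

static int is_data_present(uint16_t fc)
{
	return (fc & (FCTL_FTYPE | 0x40)) == FTYPE_DATA;
}

int main(void)
{
	uint16_t nulldata = FTYPE_DATA | STYPE_NULLFUNC;

	/* old check: any non-mgmt frame without a key was dropped */
	printf("old rule would drop nulldata: %d\n", !is_mgmt(nulldata));		/* 1 */
	/* new check: only frames that actually carry data are dropped */
	printf("new rule drops nulldata:      %d\n", is_data_present(nulldata));	/* 0 */
	return 0;
}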
Reported-by: Sebastian Gottschall Fixes: a0761a301746 ("mac80211: drop data frames without key on encrypted links") Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20201218191525.1168-1-nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 2d606dbab37c..de8325c79dd5 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -649,7 +649,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) if (!skip_hw && tx->key && tx->key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) info->control.hw_key = &tx->key->conf; - } else if (!ieee80211_is_mgmt(hdr->frame_control) && tx->sta && + } else if (ieee80211_is_data_present(hdr->frame_control) && tx->sta && test_sta_flag(tx->sta, WLAN_STA_USES_ENCRYPTION)) { return TX_DROP; } -- cgit v1.2.3 From c13cf5c159660451c8fbdc37efb998b198e1d305 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sat, 26 Dec 2020 10:39:08 +0100 Subject: mac80211: check if atf has been disabled in __ieee80211_schedule_txq MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Check if atf has been disabled in __ieee80211_schedule_txq() in order to avoid a given sta is always put to the beginning of the active_txqs list and never moved to the end since deficit is not decremented in ieee80211_sta_register_airtime() Fixes: b4809e9484da1 ("mac80211: Add airtime accounting and scheduling to TXQs") Signed-off-by: Lorenzo Bianconi Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/93889406c50f1416214c079ca0b8c9faecc5143e.1608975195.git.lorenzo@kernel.org Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index de8325c79dd5..ebb3228ce971 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3809,7 +3809,7 @@ void __ieee80211_schedule_txq(struct ieee80211_hw *hw, * get immediately moved to the back of the list on the next * call to ieee80211_next_txq(). */ - if (txqi->txq.sta && + if (txqi->txq.sta && local->airtime_flags && wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) list_add(&txqi->schedule_order, -- cgit v1.2.3 From c96adff95619178e2118925578343ad54857c80c Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 15 Jan 2021 10:50:24 -0800 Subject: cls_flower: call nla_ok() before nla_next() fl_set_enc_opt() simply checks if there are still bytes left to parse, but this is not sufficent as syzbot seems to be able to generate malformatted netlink messages. nla_ok() is more strict so should be used to validate the next nlattr here. And nla_validate_nested_deprecated() has less strict check too, it is probably too late to switch to the strict version, but we can just call nla_ok() too after it. 
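For context, nla_ok() (include/net/netlink.h) checks more than "bytes left": the attribute header must fit, and the attribute's claimed length must neither be shorter than a header nor run past the remaining buffer. A userspace restatement of that logic with a simplified struct:

#include <stdint.h>
#include <stdio.h>

struct nlattr { uint16_t nla_len; uint16_t nla_type; };

static int nla_ok_sketch(const struct nlattr *nla, int remaining)
{
	return remaining >= (int)sizeof(*nla) &&
	       nla->nla_len >= sizeof(*nla) &&
	       nla->nla_len <= remaining;
}

int main(void)
{
	struct nlattr bad = { .nla_len = 64, .nla_type = 1 };
	int remaining = 8;	/* only 8 bytes really left in the message */

	printf("bytes-left-only check passes: %d\n", remaining > 0);			/* 1 */
	printf("nla_ok passes:                %d\n", nla_ok_sketch(&bad, remaining));	/* 0 */
	return 0;
}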
Reported-and-tested-by: syzbot+2624e3778b18fc497c92@syzkaller.appspotmail.com Fixes: 0a6e77784f49 ("net/sched: allow flower to match tunnel options") Fixes: 79b1011cb33d ("net: sched: allow flower to match erspan options") Cc: Jamal Hadi Salim Cc: Xin Long Cc: Jiri Pirko Signed-off-by: Cong Wang Link: https://lore.kernel.org/r/20210115185024.72298-1-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- net/sched/cls_flower.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 1319986693fc..84f932532db7 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1272,6 +1272,10 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, nla_opt_msk = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]); msk_depth = nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]); + if (!nla_ok(nla_opt_msk, msk_depth)) { + NL_SET_ERR_MSG(extack, "Invalid nested attribute for masks"); + return -EINVAL; + } } nla_for_each_attr(nla_opt_key, nla_enc_key, @@ -1307,9 +1311,6 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, NL_SET_ERR_MSG(extack, "Key and mask miss aligned"); return -EINVAL; } - - if (msk_depth) - nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); break; case TCA_FLOWER_KEY_ENC_OPTS_VXLAN: if (key->enc_opts.dst_opt_type) { @@ -1340,9 +1341,6 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, NL_SET_ERR_MSG(extack, "Key and mask miss aligned"); return -EINVAL; } - - if (msk_depth) - nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); break; case TCA_FLOWER_KEY_ENC_OPTS_ERSPAN: if (key->enc_opts.dst_opt_type) { @@ -1373,14 +1371,20 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, NL_SET_ERR_MSG(extack, "Key and mask miss aligned"); return -EINVAL; } - - if (msk_depth) - nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); break; default: NL_SET_ERR_MSG(extack, "Unknown tunnel option type"); return -EINVAL; } + + if (!msk_depth) + continue; + + if (!nla_ok(nla_opt_msk, msk_depth)) { + NL_SET_ERR_MSG(extack, "A mask attribute is invalid"); + return -EINVAL; + } + nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); } return 0; -- cgit v1.2.3 From e4bedf48aaa5552bc1f49703abd17606e7e6e82a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Jan 2021 08:06:37 -0800 Subject: net_sched: reject silly cell_log in qdisc_get_rtab() iproute2 probably never goes beyond 8 for the cell exponent, but stick to the max shift exponent for signed 32bit. 
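In other words, cell_log feeds a shift on a 32-bit signed value in __detect_linklayer(), so any exponent of 32 or more is undefined. A minimal restatement of the tightened parameter check (illustrative only, not the qdisc_get_rtab() code):

#include <stdio.h>

static int rate_table_params_ok(unsigned int rate, unsigned int cell_log)
{
	return rate != 0 && cell_log != 0 && cell_log < 32;
}

int main(void)
{
	printf("cell_log=8:   %s\n", rate_table_params_ok(125000, 8)   ? "ok" : "rejected");
	printf("cell_log=130: %s\n", rate_table_params_ok(125000, 130) ? "ok" : "rejected");
	return 0;
}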
UBSAN reported: UBSAN: shift-out-of-bounds in net/sched/sch_api.c:389:22 shift exponent 130 is too large for 32-bit type 'int' CPU: 1 PID: 8450 Comm: syz-executor586 Not tainted 5.11.0-rc3-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x183/0x22e lib/dump_stack.c:120 ubsan_epilogue lib/ubsan.c:148 [inline] __ubsan_handle_shift_out_of_bounds+0x432/0x4d0 lib/ubsan.c:395 __detect_linklayer+0x2a9/0x330 net/sched/sch_api.c:389 qdisc_get_rtab+0x2b5/0x410 net/sched/sch_api.c:435 cbq_init+0x28f/0x12c0 net/sched/sch_cbq.c:1180 qdisc_create+0x801/0x1470 net/sched/sch_api.c:1246 tc_modify_qdisc+0x9e3/0x1fc0 net/sched/sch_api.c:1662 rtnetlink_rcv_msg+0xb1d/0xe60 net/core/rtnetlink.c:5564 netlink_rcv_skb+0x1f0/0x460 net/netlink/af_netlink.c:2494 netlink_unicast_kernel net/netlink/af_netlink.c:1304 [inline] netlink_unicast+0x7de/0x9b0 net/netlink/af_netlink.c:1330 netlink_sendmsg+0xaa6/0xe90 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg net/socket.c:672 [inline] ____sys_sendmsg+0x5a2/0x900 net/socket.c:2345 ___sys_sendmsg net/socket.c:2399 [inline] __sys_sendmsg+0x319/0x400 net/socket.c:2432 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Acked-by: Cong Wang Link: https://lore.kernel.org/r/20210114160637.1660597-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/sched/sch_api.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 51cb553e4317..6fe4e5cc807c 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -412,7 +412,8 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, { struct qdisc_rate_table *rtab; - if (tab == NULL || r->rate == 0 || r->cell_log == 0 || + if (tab == NULL || r->rate == 0 || + r->cell_log == 0 || r->cell_log >= 32 || nla_len(tab) != TC_RTAB_SIZE) { NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching"); return NULL; -- cgit v1.2.3 From dd5e073381f2ada3630f36be42833c6e9c78b75e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Jan 2021 10:19:29 -0800 Subject: net_sched: gen_estimator: support large ewma log syzbot report reminded us that very big ewma_log were supported in the past, even if they made litle sense. tc qdisc replace dev xxx root est 1sec 131072sec ... While fixing the bug, also add boundary checks for ewma_log, in line with range supported by iproute2. 
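A worked example of the re-ordered shifts: the old code scaled by "10 - ewma_log - intvl_log" in one shift, which goes negative (undefined) once ewma_log + intvl_log exceeds 10; the rewrite shifts left by "10 - intvl_log" and applies ewma_log as a separate right shift, and the new bounds keep ewma_log in 1..30. The numbers below are illustrative, not taken from the kernel:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t bytes_delta = 1 << 20;	/* bytes seen in this interval */
	int intvl_log = 2, ewma_log = 5;
	uint64_t avbps = 0, brate;

	/* new scheme: two well-defined shifts */
	brate = bytes_delta << (10 - intvl_log);
	avbps += (brate >> ewma_log) - (avbps >> ewma_log);
	printf("scaled average after one interval: %llu\n",
	       (unsigned long long)avbps);

	/* old scheme: a single shift by (10 - ewma_log - intvl_log), which
	 * is 3 here but becomes negative for, say, ewma_log = 17 */
	printf("old combined shift would be: %d\n", 10 - ewma_log - intvl_log);
	return 0;
}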
UBSAN: shift-out-of-bounds in net/core/gen_estimator.c:83:38 shift exponent -1 is negative CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.10.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x107/0x163 lib/dump_stack.c:120 ubsan_epilogue+0xb/0x5a lib/ubsan.c:148 __ubsan_handle_shift_out_of_bounds.cold+0xb1/0x181 lib/ubsan.c:395 est_timer.cold+0xbb/0x12d net/core/gen_estimator.c:83 call_timer_fn+0x1a5/0x710 kernel/time/timer.c:1417 expire_timers kernel/time/timer.c:1462 [inline] __run_timers.part.0+0x692/0xa80 kernel/time/timer.c:1731 __run_timers kernel/time/timer.c:1712 [inline] run_timer_softirq+0xb3/0x1d0 kernel/time/timer.c:1744 __do_softirq+0x2bc/0xa77 kernel/softirq.c:343 asm_call_irq_on_stack+0xf/0x20 __run_on_irqstack arch/x86/include/asm/irq_stack.h:26 [inline] run_on_irqstack_cond arch/x86/include/asm/irq_stack.h:77 [inline] do_softirq_own_stack+0xaa/0xd0 arch/x86/kernel/irq_64.c:77 invoke_softirq kernel/softirq.c:226 [inline] __irq_exit_rcu+0x17f/0x200 kernel/softirq.c:420 irq_exit_rcu+0x5/0x20 kernel/softirq.c:432 sysvec_apic_timer_interrupt+0x4d/0x100 arch/x86/kernel/apic/apic.c:1096 asm_sysvec_apic_timer_interrupt+0x12/0x20 arch/x86/include/asm/idtentry.h:628 RIP: 0010:native_save_fl arch/x86/include/asm/irqflags.h:29 [inline] RIP: 0010:arch_local_save_flags arch/x86/include/asm/irqflags.h:79 [inline] RIP: 0010:arch_irqs_disabled arch/x86/include/asm/irqflags.h:169 [inline] RIP: 0010:acpi_safe_halt drivers/acpi/processor_idle.c:111 [inline] RIP: 0010:acpi_idle_do_entry+0x1c9/0x250 drivers/acpi/processor_idle.c:516 Fixes: 1c0d32fde5bd ("net_sched: gen_estimator: complete rewrite of rate estimators") Signed-off-by: Eric Dumazet Reported-by: syzbot Link: https://lore.kernel.org/r/20210114181929.1717985-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/core/gen_estimator.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 80dbf2f4016e..8e582e29a41e 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -80,11 +80,11 @@ static void est_timer(struct timer_list *t) u64 rate, brate; est_fetch_counters(est, &b); - brate = (b.bytes - est->last_bytes) << (10 - est->ewma_log - est->intvl_log); - brate -= (est->avbps >> est->ewma_log); + brate = (b.bytes - est->last_bytes) << (10 - est->intvl_log); + brate = (brate >> est->ewma_log) - (est->avbps >> est->ewma_log); - rate = (b.packets - est->last_packets) << (10 - est->ewma_log - est->intvl_log); - rate -= (est->avpps >> est->ewma_log); + rate = (b.packets - est->last_packets) << (10 - est->intvl_log); + rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log); write_seqcount_begin(&est->seq); est->avbps += brate; @@ -143,6 +143,9 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, if (parm->interval < -2 || parm->interval > 3) return -EINVAL; + if (parm->ewma_log == 0 || parm->ewma_log >= 31) + return -EINVAL; + est = kzalloc(sizeof(*est), GFP_KERNEL); if (!est) return -ENOBUFS; -- cgit v1.2.3 From bcd0cf19ef8258ac31b9a20248b05c15a1f4b4b0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Jan 2021 10:52:29 -0800 Subject: net_sched: avoid shift-out-of-bounds in tcindex_set_parms() tc_index being 16bit wide, we need to check that TCA_TCINDEX_SHIFT attribute is not silly. 
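The 16 comes from tc_index being a 16-bit field: shifting the masked index by more than 16 leaves nothing to hash on, and a shift of 255 (as in the syzbot report below) is undefined for a 32-bit value. A small arithmetic illustration, not the classifier code:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t tc_index = 0xabcd, mask = 0xff00;
	unsigned int shift;

	for (shift = 0; shift <= 16; shift += 8)
		printf("shift=%2u -> key=0x%x\n", shift,
		       (unsigned int)((tc_index & mask) >> shift));
	/* any shift above 16 can only produce 0, and shifts >= 32 are UB */
	return 0;
}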
UBSAN: shift-out-of-bounds in net/sched/cls_tcindex.c:260:29 shift exponent 255 is too large for 32-bit type 'int' CPU: 0 PID: 8516 Comm: syz-executor228 Not tainted 5.10.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x107/0x163 lib/dump_stack.c:120 ubsan_epilogue+0xb/0x5a lib/ubsan.c:148 __ubsan_handle_shift_out_of_bounds.cold+0xb1/0x181 lib/ubsan.c:395 valid_perfect_hash net/sched/cls_tcindex.c:260 [inline] tcindex_set_parms.cold+0x1b/0x215 net/sched/cls_tcindex.c:425 tcindex_change+0x232/0x340 net/sched/cls_tcindex.c:546 tc_new_tfilter+0x13fb/0x21b0 net/sched/cls_api.c:2127 rtnetlink_rcv_msg+0x8b6/0xb80 net/core/rtnetlink.c:5555 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494 netlink_unicast_kernel net/netlink/af_netlink.c:1304 [inline] netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1330 netlink_sendmsg+0x907/0xe40 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:672 ____sys_sendmsg+0x6e8/0x810 net/socket.c:2336 ___sys_sendmsg+0xf3/0x170 net/socket.c:2390 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2423 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Link: https://lore.kernel.org/r/20210114185229.1742255-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/sched/cls_tcindex.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 78bec347b8b6..c4007b9cd16d 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -366,9 +366,13 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, if (tb[TCA_TCINDEX_MASK]) cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]); - if (tb[TCA_TCINDEX_SHIFT]) + if (tb[TCA_TCINDEX_SHIFT]) { cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]); - + if (cp->shift > 16) { + err = -EINVAL; + goto errout; + } + } if (!cp->hash) { /* Hash not specified, use perfect hash if the upper limit * of the hashing index is below the threshold. -- cgit v1.2.3 From 66c556025d687dbdd0f748c5e1df89c977b6c02a Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 15 Jan 2021 15:04:40 +0000 Subject: skbuff: back tiny skbs with kmalloc() in __netdev_alloc_skb() too Commit 3226b158e67c ("net: avoid 32 x truesize under-estimation for tiny skbs") ensured that skbs with data size lower than 1025 bytes will be kmalloc'ed to avoid excessive page cache fragmentation and memory consumption. However, the fix adressed only __napi_alloc_skb() (primarily for virtio_net and napi_get_frags()), but the issue can still be achieved through __netdev_alloc_skb(), which is still used by several drivers. Drivers often allocate a tiny skb for headers and place the rest of the frame to frags (so-called copybreak). Mirror the condition to __netdev_alloc_skb() to handle this case too. Since v1 [0]: - fix "Fixes:" tag; - refine commit message (mention copybreak usecase). 
[0] https://lore.kernel.org/netdev/20210114235423.232737-1-alobakin@pm.me Fixes: a1c7fff7e18f ("net: netdev_alloc_skb() use build_skb()") Signed-off-by: Alexander Lobakin Link: https://lore.kernel.org/r/20210115150354.85967-1-alobakin@pm.me Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c1a6f262636a..785daff48030 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -437,7 +437,11 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, len += NET_SKB_PAD; - if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || + /* If requested length is either too small or too big, + * we use kmalloc() for skb->head allocation. + */ + if (len <= SKB_WITH_OVERHEAD(1024) || + len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); if (!skb) -- cgit v1.2.3 From a826b04303a40d52439aa141035fca5654ccaccd Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Fri, 15 Jan 2021 19:42:08 +0100 Subject: ipv6: create multicast route with RTPROT_KERNEL The ff00::/8 multicast route is created without specifying the fc_protocol field, so the default RTPROT_BOOT value is used: $ ip -6 -d route unicast ::1 dev lo proto kernel scope global metric 256 pref medium unicast fe80::/64 dev eth0 proto kernel scope global metric 256 pref medium unicast ff00::/8 dev eth0 proto boot scope global metric 256 pref medium As the documentation says, this value identifies routes installed during boot, but the route is created when interface is set up. Change the value to RTPROT_KERNEL which is a better value. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Matteo Croce Signed-off-by: Jakub Kicinski --- net/ipv6/addrconf.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index eff2cacd5209..19bf6822911c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2469,6 +2469,7 @@ static void addrconf_add_mroute(struct net_device *dev) .fc_flags = RTF_UP, .fc_type = RTN_UNICAST, .fc_nlinfo.nl_net = dev_net(dev), + .fc_protocol = RTPROT_KERNEL, }; ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0); -- cgit v1.2.3 From ceed9038b2783d14e0422bdc6fd04f70580efb4c Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Fri, 15 Jan 2021 19:42:09 +0100 Subject: ipv6: set multicast flag on the multicast route The multicast route ff00::/8 is created with type RTN_UNICAST: $ ip -6 -d route unicast ::1 dev lo proto kernel scope global metric 256 pref medium unicast fe80::/64 dev eth0 proto kernel scope global metric 256 pref medium unicast ff00::/8 dev eth0 proto kernel scope global metric 256 pref medium Set the type to RTN_MULTICAST which is more appropriate. 
Fixes: e8478e80e5a7 ("net/ipv6: Save route type in rt6_info") Signed-off-by: Matteo Croce Signed-off-by: Jakub Kicinski --- net/ipv6/addrconf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 19bf6822911c..9edc5bb2d531 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2467,7 +2467,7 @@ static void addrconf_add_mroute(struct net_device *dev) .fc_ifindex = dev->ifindex, .fc_dst_len = 8, .fc_flags = RTF_UP, - .fc_type = RTN_UNICAST, + .fc_type = RTN_MULTICAST, .fc_nlinfo.nl_net = dev_net(dev), .fc_protocol = RTPROT_KERNEL, }; -- cgit v1.2.3 From 9d9b1ee0b2d1c9e02b2338c4a4b0a062d2d3edac Mon Sep 17 00:00:00 2001 From: Enke Chen Date: Fri, 15 Jan 2021 14:30:58 -0800 Subject: tcp: fix TCP_USER_TIMEOUT with zero window The TCP session does not terminate with TCP_USER_TIMEOUT when data remain untransmitted due to zero window. The number of unanswered zero-window probes (tcp_probes_out) is reset to zero with incoming acks irrespective of the window size, as described in tcp_probe_timer(): RFC 1122 4.2.2.17 requires the sender to stay open indefinitely as long as the receiver continues to respond probes. We support this by default and reset icsk_probes_out with incoming ACKs. This counter, however, is the wrong one to be used in calculating the duration that the window remains closed and data remain untransmitted. Thanks to Jonathan Maxwell for diagnosing the actual issue. In this patch a new timestamp is introduced for the socket in order to track the elapsed time for the zero-window probes that have not been answered with any non-zero window ack. Fixes: 9721e709fa68 ("tcp: simplify window probe aborting on USER_TIMEOUT") Reported-by: William McCall Co-developed-by: Neal Cardwell Signed-off-by: Neal Cardwell Signed-off-by: Enke Chen Reviewed-by: Yuchung Cheng Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20210115223058.GA39267@localhost.localdomain Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 3 +++ net/ipv4/inet_connection_sock.c | 1 + net/ipv4/tcp.c | 1 + net/ipv4/tcp_input.c | 1 + net/ipv4/tcp_output.c | 1 + net/ipv4/tcp_timer.c | 14 +++++++------- 6 files changed, 14 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 7338b3865a2a..111d7771b208 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -76,6 +76,8 @@ struct inet_connection_sock_af_ops { * @icsk_ext_hdr_len: Network protocol overhead (IP/IPv6 options) * @icsk_ack: Delayed ACK control data * @icsk_mtup; MTU probing control data + * @icsk_probes_tstamp: Probe timestamp (cleared by non-zero window ack) + * @icsk_user_timeout: TCP_USER_TIMEOUT value */ struct inet_connection_sock { /* inet_sock has to be the first member! */ @@ -129,6 +131,7 @@ struct inet_connection_sock { u32 probe_timestamp; } icsk_mtup; + u32 icsk_probes_tstamp; u32 icsk_user_timeout; u64 icsk_ca_priv[104 / sizeof(u64)]; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index fd8b8800a2c3..6bd7ca09af03 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -851,6 +851,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, newicsk->icsk_retransmits = 0; newicsk->icsk_backoff = 0; newicsk->icsk_probes_out = 0; + newicsk->icsk_probes_tstamp = 0; /* Deinitialize accept_queue to trap illegal accesses. 
*/ memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ed42d2193c5c..32545ecf2ab1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2937,6 +2937,7 @@ int tcp_disconnect(struct sock *sk, int flags) icsk->icsk_backoff = 0; icsk->icsk_probes_out = 0; + icsk->icsk_probes_tstamp = 0; icsk->icsk_rto = TCP_TIMEOUT_INIT; icsk->icsk_rto_min = TCP_RTO_MIN; icsk->icsk_delack_max = TCP_DELACK_MAX; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c7e16b0ed791..bafcab75f425 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3384,6 +3384,7 @@ static void tcp_ack_probe(struct sock *sk) return; if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) { icsk->icsk_backoff = 0; + icsk->icsk_probes_tstamp = 0; inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); /* Socket must be waked up by subsequent tcp_data_snd_check(). * This function is not for random using! diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f322e798a351..ab458697881e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -4084,6 +4084,7 @@ void tcp_send_probe0(struct sock *sk) /* Cancel probe timer, if it is not required. */ icsk->icsk_probes_out = 0; icsk->icsk_backoff = 0; + icsk->icsk_probes_tstamp = 0; return; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 6c62b9ea1320..454732ecc8f3 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -349,6 +349,7 @@ static void tcp_probe_timer(struct sock *sk) if (tp->packets_out || !skb) { icsk->icsk_probes_out = 0; + icsk->icsk_probes_tstamp = 0; return; } @@ -360,13 +361,12 @@ static void tcp_probe_timer(struct sock *sk) * corresponding system limit. We also implement similar policy when * we use RTO to probe window in tcp_retransmit_timer(). 
*/ - if (icsk->icsk_user_timeout) { - u32 elapsed = tcp_model_timeout(sk, icsk->icsk_probes_out, - tcp_probe0_base(sk)); - - if (elapsed >= icsk->icsk_user_timeout) - goto abort; - } + if (!icsk->icsk_probes_tstamp) + icsk->icsk_probes_tstamp = tcp_jiffies32; + else if (icsk->icsk_user_timeout && + (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >= + msecs_to_jiffies(icsk->icsk_user_timeout)) + goto abort; max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { -- cgit v1.2.3 From 7e238de8283acd32c26c2bc2a50672d0ea862ff7 Mon Sep 17 00:00:00 2001 From: Oleksandr Mazur Date: Tue, 19 Jan 2021 10:53:33 +0200 Subject: net: core: devlink: use right genl user_ptr when handling port param get/set Fix incorrect user_ptr dereferencing when handling port param get/set: idx [0] stores the 'struct devlink' pointer; idx [1] stores the 'struct devlink_port' pointer; Fixes: 637989b5d77e ("devlink: Always use user_ptr[0] for devlink and simplify post_doit") CC: Parav Pandit Signed-off-by: Oleksandr Mazur Signed-off-by: Vadym Kochan Link: https://lore.kernel.org/r/20210119085333.16833-1-vadym.kochan@plvision.eu Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index ee828e4b1007..738d4344d679 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4146,7 +4146,7 @@ out: static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink_param_item *param_item; struct sk_buff *msg; int err; @@ -4175,7 +4175,7 @@ static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb, static int devlink_nl_cmd_port_param_set_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink_port *devlink_port = info->user_ptr[1]; return __devlink_nl_cmd_param_set_doit(devlink_port->devlink, devlink_port->index, -- cgit v1.2.3 From b425e24a934e21a502d25089c6c7443d799c5594 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Mon, 18 Jan 2021 18:03:33 +0200 Subject: xsk: Clear pool even for inactive queues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The number of queues can change by other means, rather than ethtool. For example, attaching an mqprio qdisc with num_tc > 1 leads to creating multiple sets of TX queues, which may be then destroyed when mqprio is deleted. If an AF_XDP socket is created while mqprio is active, dev->_tx[queue_id].pool will be filled, but then real_num_tx_queues may decrease with deletion of mqprio, which will mean that the pool won't be NULLed, and a further increase of the number of TX queues may expose a dangling pointer. To avoid any potential misbehavior, this commit clears pool for RX and TX queues, regardless of real_num_*_queues, still taking into consideration num_*_queues to avoid overflows. 
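A toy model of the stale-pointer scenario the patch closes: pools live in per-queue slots sized by num_tx_queues, but the active count (real_num_tx_queues) can shrink after the socket was bound, so clearing only up to the active count leaves the old pool pointer behind. The names below are stand-ins for illustration:

#include <stdio.h>

#define NUM_TX_QUEUES 8			/* allocated queue slots */

static void *tx_pool[NUM_TX_QUEUES];

static void clear_pool(unsigned int queue_id, unsigned int limit)
{
	if (queue_id < limit)
		tx_pool[queue_id] = NULL;
}

int main(void)
{
	unsigned int real_num_tx_queues = 8;
	int pool = 1;

	tx_pool[6] = &pool;		/* socket bound while queue 6 was active */
	real_num_tx_queues = 4;		/* e.g. mqprio removed */

	clear_pool(6, real_num_tx_queues);	/* old behaviour: slot untouched */
	printf("after clearing up to the active count:    %p\n", tx_pool[6]);

	clear_pool(6, NUM_TX_QUEUES);		/* fixed behaviour: slot cleared */
	printf("after clearing up to the allocated count: %p\n", tx_pool[6]);
	return 0;
}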
Fixes: 1c1efc2af158 ("xsk: Create and free buffer pool independently from umem") Fixes: a41b4f3c58dd ("xsk: simplify xdp_clear_umem_at_qid implementation") Signed-off-by: Maxim Mikityanskiy Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20210118160333.333439-1-maximmi@mellanox.com --- net/xdp/xsk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 8037b04a9edd..4a83117507f5 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -108,9 +108,9 @@ EXPORT_SYMBOL(xsk_get_pool_from_qid); void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) { - if (queue_id < dev->real_num_rx_queues) + if (queue_id < dev->num_rx_queues) dev->_rx[queue_id].pool = NULL; - if (queue_id < dev->real_num_tx_queues) + if (queue_id < dev->num_tx_queues) dev->_tx[queue_id].pool = NULL; } -- cgit v1.2.3 From 8d2b51b008c25240914984208b2ced57d1dd25a5 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Sat, 16 Jan 2021 11:44:22 +0100 Subject: udp: mask TOS bits in udp_v4_early_demux() udp_v4_early_demux() is the only function that calls ip_mc_validate_source() with a TOS that hasn't been masked with IPTOS_RT_MASK. This results in different behaviours for incoming multicast UDPv4 packets, depending on if ip_mc_validate_source() is called from the early-demux path (udp_v4_early_demux) or from the regular input path (ip_route_input_noref). ECN would normally not be used with UDP multicast packets, so the practical consequences should be limited on that side. However, IPTOS_RT_MASK is used to also masks the TOS' high order bits, to align with the non-early-demux path behaviour. Reproducer: Setup two netns, connected with veth: $ ip netns add ns0 $ ip netns add ns1 $ ip -netns ns0 link set dev lo up $ ip -netns ns1 link set dev lo up $ ip link add name veth01 netns ns0 type veth peer name veth10 netns ns1 $ ip -netns ns0 link set dev veth01 up $ ip -netns ns1 link set dev veth10 up $ ip -netns ns0 address add 192.0.2.10 peer 192.0.2.11/32 dev veth01 $ ip -netns ns1 address add 192.0.2.11 peer 192.0.2.10/32 dev veth10 In ns0, add route to multicast address 224.0.2.0/24 using source address 198.51.100.10: $ ip -netns ns0 address add 198.51.100.10/32 dev lo $ ip -netns ns0 route add 224.0.2.0/24 dev veth01 src 198.51.100.10 In ns1, define route to 198.51.100.10, only for packets with TOS 4: $ ip -netns ns1 route add 198.51.100.10/32 tos 4 dev veth10 Also activate rp_filter in ns1, so that incoming packets not matching the above route get dropped: $ ip netns exec ns1 sysctl -wq net.ipv4.conf.veth10.rp_filter=1 Now try to receive packets on 224.0.2.11: $ ip netns exec ns1 socat UDP-RECVFROM:1111,ip-add-membership=224.0.2.11:veth10,ignoreeof - In ns0, send packet to 224.0.2.11 with TOS 4 and ECT(0) (that is, tos 6 for socat): $ echo test0 | ip netns exec ns0 socat - UDP-DATAGRAM:224.0.2.11:1111,bind=:1111,tos=6 The "test0" message is properly received by socat in ns1, because early-demux has no cached dst to use, so source address validation is done by ip_route_input_mc(), which receives a TOS that has the ECN bits masked. Now send another packet to 224.0.2.11, still with TOS 4 and ECT(0): $ echo test1 | ip netns exec ns0 socat - UDP-DATAGRAM:224.0.2.11:1111,bind=:1111,tos=6 The "test1" message isn't received by socat in ns1, because, now, early-demux has a cached dst to use and calls ip_mc_validate_source() immediately, without masking the ECN bits. 
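For reference, the arithmetic: socat's tos=6 is TOS 4 plus ECT(0), and only the IPTOS_RT_MASK variant strips both ECN bits so the value matches the "tos 4" route. The mask values below follow the UAPI headers as I understand them; the snippet only shows the masking, not the kernel path:

#include <stdio.h>

#define IPTOS_TOS_MASK	0x1e			/* what RT_TOS() keeps */
#define IPTOS_RT_MASK	(IPTOS_TOS_MASK & ~3)	/* 0x1c: both ECN bits cleared */

int main(void)
{
	unsigned int tos = 6;	/* TOS 4 + ECT(0), as in the reproducer */

	printf("raw tos:              0x%02x\n", tos);
	printf("tos & IPTOS_TOS_MASK: 0x%02x\n", tos & IPTOS_TOS_MASK);	/* 0x06 */
	printf("tos & IPTOS_RT_MASK:  0x%02x\n", tos & IPTOS_RT_MASK);	/* 0x04 */
	return 0;
}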
Fixes: bc044e8db796 ("udp: perform source validation for mcast early demux") Signed-off-by: Guillaume Nault Signed-off-by: Jakub Kicinski --- net/ipv4/udp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 7103b0a89756..69ea76578abb 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2555,7 +2555,8 @@ int udp_v4_early_demux(struct sk_buff *skb) */ if (!inet_sk(sk)->inet_daddr && in_dev) return ip_mc_validate_source(skb, iph->daddr, - iph->saddr, iph->tos, + iph->saddr, + iph->tos & IPTOS_RT_MASK, skb->dev, in_dev, &itag); } return 0; -- cgit v1.2.3 From 2e5a6266fbb11ae93c468dfecab169aca9c27b43 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Sat, 16 Jan 2021 11:44:26 +0100 Subject: netfilter: rpfilter: mask ecn bits before fib lookup RT_TOS() only masks one of the two ECN bits. Therefore rpfilter_mt() treats Not-ECT or ECT(1) packets in a different way than those with ECT(0) or CE. Reproducer: Create two netns, connected with a veth: $ ip netns add ns0 $ ip netns add ns1 $ ip link add name veth01 netns ns0 type veth peer name veth10 netns ns1 $ ip -netns ns0 link set dev veth01 up $ ip -netns ns1 link set dev veth10 up $ ip -netns ns0 address add 192.0.2.10/32 dev veth01 $ ip -netns ns1 address add 192.0.2.11/32 dev veth10 Add a route to ns1 in ns0: $ ip -netns ns0 route add 192.0.2.11/32 dev veth01 In ns1, only packets with TOS 4 can be routed to ns0: $ ip -netns ns1 route add 192.0.2.10/32 tos 4 dev veth10 Ping from ns0 to ns1 works regardless of the ECN bits, as long as TOS is 4: $ ip netns exec ns0 ping -Q 4 192.0.2.11 # TOS 4, Not-ECT ... 0% packet loss ... $ ip netns exec ns0 ping -Q 5 192.0.2.11 # TOS 4, ECT(1) ... 0% packet loss ... $ ip netns exec ns0 ping -Q 6 192.0.2.11 # TOS 4, ECT(0) ... 0% packet loss ... $ ip netns exec ns0 ping -Q 7 192.0.2.11 # TOS 4, CE ... 0% packet loss ... Now use iptable's rpfilter module in ns1: $ ip netns exec ns1 iptables-legacy -t raw -A PREROUTING -m rpfilter --invert -j DROP Not-ECT and ECT(1) packets still pass: $ ip netns exec ns0 ping -Q 4 192.0.2.11 # TOS 4, Not-ECT ... 0% packet loss ... $ ip netns exec ns0 ping -Q 5 192.0.2.11 # TOS 4, ECT(1) ... 0% packet loss ... But ECT(0) and ECN packets are dropped: $ ip netns exec ns0 ping -Q 6 192.0.2.11 # TOS 4, ECT(0) ... 100% packet loss ... $ ip netns exec ns0 ping -Q 7 192.0.2.11 # TOS 4, CE ... 100% packet loss ... After this patch, rpfilter doesn't drop ECT(0) and CE packets anymore. Fixes: 8f97339d3feb ("netfilter: add ipv4 reverse path filter match") Signed-off-by: Guillaume Nault Signed-off-by: Jakub Kicinski --- net/ipv4/netfilter/ipt_rpfilter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index cc23f1ce239c..8cd3224d913e 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -76,7 +76,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) flow.daddr = iph->saddr; flow.saddr = rpfilter_get_saddr(iph->daddr); flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? 
skb->mark : 0; - flow.flowi4_tos = RT_TOS(iph->tos); + flow.flowi4_tos = iph->tos & IPTOS_RT_MASK; flow.flowi4_scope = RT_SCOPE_UNIVERSE; flow.flowi4_oif = l3mdev_master_ifindex_rcu(xt_in(par)); -- cgit v1.2.3 From a3eb4e9d4c9218476d05c52dfd2be3d6fdce6b91 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Sun, 17 Jan 2021 17:15:38 +0200 Subject: net: Disable NETIF_F_HW_TLS_RX when RXCSUM is disabled With NETIF_F_HW_TLS_RX packets are decrypted in HW. This cannot be logically done when RXCSUM offload is off. Fixes: 14136564c8ee ("net: Add TLS RX offload feature") Signed-off-by: Tariq Toukan Reviewed-by: Boris Pismenny Link: https://lore.kernel.org/r/20210117151538.9411-1-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- Documentation/networking/tls-offload.rst | 3 +++ net/core/dev.c | 5 +++++ 2 files changed, 8 insertions(+) (limited to 'net') diff --git a/Documentation/networking/tls-offload.rst b/Documentation/networking/tls-offload.rst index 9af3334d9ad0..5f0dea3d571e 100644 --- a/Documentation/networking/tls-offload.rst +++ b/Documentation/networking/tls-offload.rst @@ -534,3 +534,6 @@ offload. Hence, TLS TX device feature flag requires TX csum offload being set. Disabling the latter implies clearing the former. Disabling TX checksum offload should not affect old connections, and drivers should make sure checksum calculation does not break for them. +Similarly, device-offloaded TLS decryption implies doing RXCSUM. If the user +does not want to enable RX csum offload, TLS RX device feature is disabled +as well. diff --git a/net/core/dev.c b/net/core/dev.c index c360bb5367e2..a979b86dbacd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9672,6 +9672,11 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, } } + if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) { + netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n"); + features &= ~NETIF_F_HW_TLS_RX; + } + return features; } -- cgit v1.2.3 From 4964e5a1e080f785f5518b402a9e48c527fe6cbd Mon Sep 17 00:00:00 2001 From: Bongsu Jeon Date: Tue, 19 Jan 2021 05:55:22 +0900 Subject: net: nfc: nci: fix the wrong NCI_CORE_INIT parameters Fix the code because NCI_CORE_INIT_CMD includes two parameters in NCI2.0 but there is no parameters in NCI1.x. Fixes: bcd684aace34 ("net/nfc/nci: Support NCI 2.x initial sequence") Signed-off-by: Bongsu Jeon Link: https://lore.kernel.org/r/20210118205522.317087-1-bongsu.jeon@samsung.com Signed-off-by: Jakub Kicinski --- net/nfc/nci/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index e64727e1a72f..02a1f13f0798 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -508,7 +508,7 @@ static int nci_open_device(struct nci_dev *ndev) }; unsigned long opt = 0; - if (!(ndev->nci_ver & NCI_VER_2_MASK)) + if (ndev->nci_ver & NCI_VER_2_MASK) opt = (unsigned long)&nci_init_v2_cmd; rc = __nci_request(ndev, nci_init_req, opt, -- cgit v1.2.3 From b160c28548bc0a87cbd16d5af6d3edcfd70b8c9a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Jan 2021 08:49:00 -0800 Subject: tcp: do not mess with cloned skbs in tcp_add_backlog() Heiner Kallweit reported that some skbs were sent with the following invalid GSO properties : - gso_size > 0 - gso_type == 0 This was triggerring a WARN_ON_ONCE() in rtl8169_tso_csum_v2. Juerg Haefliger was able to reproduce a similar issue using a lan78xx NIC and a workload mixing TCP incoming traffic and forwarded packets. 
The problem is that tcp_add_backlog() is writing over gso_segs and gso_size even if the incoming packet will not be coalesced to the backlog tail packet. While skb_try_coalesce() would bail out if tail packet is cloned, this overwriting would lead to corruptions of other packets cooked by lan78xx, sharing a common super-packet. The strategy used by lan78xx is to use a big skb, and split it into all received packets using skb_clone() to avoid copies. The drawback of this strategy is that all the small skb share a common struct skb_shared_info. This patch rewrites TCP gso_size/gso_segs handling to only happen on the tail skb, since skb_try_coalesce() made sure it was not cloned. Fixes: 4f693b55c3d2 ("tcp: implement coalescing on backlog queue") Signed-off-by: Eric Dumazet Bisected-by: Juerg Haefliger Tested-by: Juerg Haefliger Reported-by: Heiner Kallweit Link: https://bugzilla.kernel.org/show_bug.cgi?id=209423 Link: https://lore.kernel.org/r/20210119164900.766957-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ipv4.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 58207c7769d0..4e82745d336f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1760,6 +1760,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) { u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf); + u32 tail_gso_size, tail_gso_segs; struct skb_shared_info *shinfo; const struct tcphdr *th; struct tcphdr *thtail; @@ -1767,6 +1768,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) unsigned int hdrlen; bool fragstolen; u32 gso_segs; + u32 gso_size; int delta; /* In case all data was pulled from skb frags (in __pskb_pull_tail()), @@ -1792,13 +1794,6 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) */ th = (const struct tcphdr *)skb->data; hdrlen = th->doff * 4; - shinfo = skb_shinfo(skb); - - if (!shinfo->gso_size) - shinfo->gso_size = skb->len - hdrlen; - - if (!shinfo->gso_segs) - shinfo->gso_segs = 1; tail = sk->sk_backlog.tail; if (!tail) @@ -1821,6 +1816,15 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) goto no_coalesce; __skb_pull(skb, hdrlen); + + shinfo = skb_shinfo(skb); + gso_size = shinfo->gso_size ?: skb->len; + gso_segs = shinfo->gso_segs ?: 1; + + shinfo = skb_shinfo(tail); + tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); + tail_gso_segs = shinfo->gso_segs ?: 1; + if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; @@ -1847,11 +1851,8 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) } /* Not as strict as GRO. We only need to carry mss max value */ - skb_shinfo(tail)->gso_size = max(shinfo->gso_size, - skb_shinfo(tail)->gso_size); - - gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs; - skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF); + shinfo->gso_size = max(gso_size, tail_gso_size); + shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); sk->sk_backlog.len += delta; __NET_INC_STATS(sock_net(sk), -- cgit v1.2.3 From 9c30ae8398b0813e237bde387d67a7f74ab2db2d Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 19 Jan 2021 11:26:19 -0800 Subject: tcp: fix TCP socket rehash stats mis-accounting The previous commit 32efcc06d2a1 ("tcp: export count for rehash attempts") would mis-account rehashing SNMP and socket stats: a. 
During handshake of an active open, only counts the first SYN timeout b. After handshake of passive and active open, stop updating after (roughly) TCP_RETRIES1 recurring RTOs c. After the socket aborts, over count timeout_rehash by 1 This patch fixes this by checking the rehash result from sk_rethink_txhash. Fixes: 32efcc06d2a1 ("tcp: export count for rehash attempts") Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Link: https://lore.kernel.org/r/20210119192619.1848270-1-ycheng@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 17 ++++++++++++----- net/ipv4/tcp_input.c | 5 ++--- net/ipv4/tcp_timer.c | 22 ++++++++-------------- 3 files changed, 22 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index bdc4323ce53c..129d200bccb4 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1921,10 +1921,13 @@ static inline void sk_set_txhash(struct sock *sk) sk->sk_txhash = net_tx_rndhash(); } -static inline void sk_rethink_txhash(struct sock *sk) +static inline bool sk_rethink_txhash(struct sock *sk) { - if (sk->sk_txhash) + if (sk->sk_txhash) { sk_set_txhash(sk); + return true; + } + return false; } static inline struct dst_entry * @@ -1947,12 +1950,10 @@ sk_dst_get(struct sock *sk) return dst; } -static inline void dst_negative_advice(struct sock *sk) +static inline void __dst_negative_advice(struct sock *sk) { struct dst_entry *ndst, *dst = __sk_dst_get(sk); - sk_rethink_txhash(sk); - if (dst && dst->ops->negative_advice) { ndst = dst->ops->negative_advice(dst); @@ -1964,6 +1965,12 @@ static inline void dst_negative_advice(struct sock *sk) } } +static inline void dst_negative_advice(struct sock *sk) +{ + sk_rethink_txhash(sk); + __dst_negative_advice(sk); +} + static inline void __sk_dst_set(struct sock *sk, struct dst_entry *dst) { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bafcab75f425..a7dfca0a38cd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4397,10 +4397,9 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) * The receiver remembers and reflects via DSACKs. Leverage the * DSACK state and change the txhash to re-route speculatively. */ - if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq) { - sk_rethink_txhash(sk); + if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq && + sk_rethink_txhash(sk)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH); - } } static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 454732ecc8f3..faa92948441b 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -219,14 +219,8 @@ static int tcp_write_timeout(struct sock *sk) int retry_until; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { - if (icsk->icsk_retransmits) { - dst_negative_advice(sk); - } else { - sk_rethink_txhash(sk); - tp->timeout_rehash++; - __NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPTIMEOUTREHASH); - } + if (icsk->icsk_retransmits) + __dst_negative_advice(sk); retry_until = icsk->icsk_syn_retries ? 
: net->ipv4.sysctl_tcp_syn_retries; expired = icsk->icsk_retransmits >= retry_until; } else { @@ -234,12 +228,7 @@ static int tcp_write_timeout(struct sock *sk) /* Black hole detection */ tcp_mtu_probing(icsk, sk); - dst_negative_advice(sk); - } else { - sk_rethink_txhash(sk); - tp->timeout_rehash++; - __NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPTIMEOUTREHASH); + __dst_negative_advice(sk); } retry_until = net->ipv4.sysctl_tcp_retries2; @@ -270,6 +259,11 @@ static int tcp_write_timeout(struct sock *sk) return 1; } + if (sk_rethink_txhash(sk)) { + tp->timeout_rehash++; + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTREHASH); + } + return 0; } -- cgit v1.2.3 From c89dffc70b340780e5b933832d8c3e045ef3791e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 18 Jan 2021 14:59:20 +0900 Subject: tcp: Fix potential use-after-free due to double kfree() Receiving ACK with a valid SYN cookie, cookie_v4_check() allocates struct request_sock and then can allocate inet_rsk(req)->ireq_opt. After that, tcp_v4_syn_recv_sock() allocates struct sock and copies ireq_opt to inet_sk(sk)->inet_opt. Normally, tcp_v4_syn_recv_sock() inserts the full socket into ehash and sets NULL to ireq_opt. Otherwise, tcp_v4_syn_recv_sock() has to reset inet_opt by NULL and free the full socket. The commit 01770a1661657 ("tcp: fix race condition when creating child sockets from syncookies") added a new path, in which more than one cores create full sockets for the same SYN cookie. Currently, the core which loses the race frees the full socket without resetting inet_opt, resulting in that both sock_put() and reqsk_put() call kfree() for the same memory: sock_put sk_free __sk_free sk_destruct __sk_destruct sk->sk_destruct/inet_sock_destruct kfree(rcu_dereference_protected(inet->inet_opt, 1)); reqsk_put reqsk_free __reqsk_free req->rsk_ops->destructor/tcp_v4_reqsk_destructor kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); Calling kmalloc() between the double kfree() can lead to use-after-free, so this patch fixes it by setting NULL to inet_opt before sock_put(). As a side note, this kind of issue does not happen for IPv6. This is because tcp_v6_syn_recv_sock() clones both ipv6_opt and pktopts which correspond to ireq_opt in IPv4. Fixes: 01770a166165 ("tcp: fix race condition when creating child sockets from syncookies") CC: Ricardo Dias Signed-off-by: Kuniyuki Iwashima Reviewed-by: Benjamin Herrenschmidt Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20210118055920.82516-1-kuniyu@amazon.co.jp Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ipv4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4e82745d336f..777306b5bc22 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1595,6 +1595,8 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, tcp_move_syn(newtp, req); ireq->ireq_opt = NULL; } else { + newinet->inet_opt = NULL; + if (!req_unhash && found_dup_sk) { /* This code path should only be executed in the * syncookie case only @@ -1602,8 +1604,6 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, bh_unlock_sock(newsk); sock_put(newsk); newsk = NULL; - } else { - newinet->inet_opt = NULL; } } return newsk; -- cgit v1.2.3
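To make the ownership point of the last fix concrete: after the patch, the losing CPU clears its copy of the options pointer before dropping the socket, so only the request's destructor frees the blob. A standalone model of that pattern (the names deliberately do not match the kernel structures):

#include <stdio.h>
#include <stdlib.h>

struct request { void *opt; };
struct child   { void *opt; };

static void request_destruct(struct request *r) { free(r->opt); }
static void child_destruct(struct child *c)     { free(c->opt); }

int main(void)
{
	struct request req = { .opt = malloc(16) };
	struct child   sk  = { .opt = req.opt };	/* pointer was copied, not cloned */

	/* fixed error path: drop the child's reference first ... */
	sk.opt = NULL;
	child_destruct(&sk);		/* free(NULL) is a no-op */
	request_destruct(&req);		/* ... so the blob is freed exactly once */
	puts("freed exactly once");
	return 0;
}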