From 3a5913183aa1b14148c723bda030e6102ad73008 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Sun, 9 Oct 2022 22:16:43 +0300 Subject: xfrm: fix "disable_policy" on ipv4 early demux The commit in the "Fixes" tag tried to avoid a case where policy check is ignored due to dst caching in next hops. However, when the traffic is locally consumed, the dst may be cached in a local TCP or UDP socket as part of early demux. In this case the "disable_policy" flag is not checked as ip_route_input_noref() was only called before caching, and thus, packets after the initial packet in a flow will be dropped if not matching policies. Fix by checking the "disable_policy" flag also when a valid dst is already available. Link: https://bugzilla.kernel.org/show_bug.cgi?id=216557 Reported-by: Monil Patel Fixes: e6175a2ed1f1 ("xfrm: fix "disable_policy" flag use when arriving from different devices") Signed-off-by: Eyal Birger ---- v2: use dev instead of skb->dev Signed-off-by: Steffen Klassert --- net/ipv4/ip_input.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 1b512390b3cf..e880ce77322a 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -366,6 +366,11 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk, iph->tos, dev); if (unlikely(err)) goto drop_error; + } else { + struct in_device *in_dev = __in_dev_get_rcu(dev); + + if (in_dev && IN_DEV_ORCONF(in_dev, NOPOLICY)) + IPCB(skb)->flags |= IPSKB_NOPOLICY; } #ifdef CONFIG_IP_ROUTE_CLASSID -- cgit v1.2.3 From d83f7040e18489265b4b121f33f99b02e52dabda Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Tue, 11 Oct 2022 11:01:37 +0300 Subject: xfrm: lwtunnel: squelch kernel warning in case XFRM encap type is not available Ido reported that a kernel warning [1] can be triggered from user space when the kernel is compiled with CONFIG_MODULES=y and CONFIG_XFRM=n when adding an xfrm encap type route, e.g: $ ip route add 198.51.100.0/24 dev dummy1 encap xfrm if_id 1 Error: lwt encapsulation type not supported. The reason for the warning is that the LWT infrastructure has an autoloading feature which is meant only for encap types that don't use a net device, which is not the case in xfrm encap. Mute this warning for xfrm encap as there's no encap module to autoload in this case. [1] WARNING: CPU: 3 PID: 2746262 at net/core/lwtunnel.c:57 lwtunnel_valid_encap_type+0x4f/0x120 [...] 
Call Trace: rtm_to_fib_config+0x211/0x350 inet_rtm_newroute+0x3a/0xa0 rtnetlink_rcv_msg+0x154/0x3c0 netlink_rcv_skb+0x49/0xf0 netlink_unicast+0x22f/0x350 netlink_sendmsg+0x208/0x440 ____sys_sendmsg+0x21f/0x250 ___sys_sendmsg+0x83/0xd0 __sys_sendmsg+0x54/0xa0 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Reported-by: Ido Schimmel Fixes: 2c2493b9da91 ("xfrm: lwtunnel: add lwtunnel support for xfrm interfaces in collect_md mode") Signed-off-by: Eyal Birger Tested-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: Steffen Klassert --- net/core/lwtunnel.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 6fac2f0ef074..711cd3b4347a 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -48,9 +48,11 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) return "RPL"; case LWTUNNEL_ENCAP_IOAM6: return "IOAM6"; + case LWTUNNEL_ENCAP_XFRM: + /* module autoload not supported for encap type */ + return NULL; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: - case LWTUNNEL_ENCAP_XFRM: case LWTUNNEL_ENCAP_NONE: case __LWTUNNEL_ENCAP_MAX: /* should not have got here */ -- cgit v1.2.3 From 4b549ccce941798703f159b227aa28c716aa78fa Mon Sep 17 00:00:00 2001 From: Christian Langrock Date: Mon, 17 Oct 2022 08:34:47 +0200 Subject: xfrm: replay: Fix ESN wrap around for GSO When using GSO it can happen that the wrong seq_hi is used for the last packets before the wrap around. This can lead to double usage of a sequence number. To avoid this, we should serialize this last GSO packet. Fixes: d7dbefc45cf5 ("xfrm: Add xfrm_replay_overflow functions for offloading") Co-developed-by: Steffen Klassert Signed-off-by: Christian Langrock Signed-off-by: Steffen Klassert --- net/ipv4/esp4_offload.c | 3 +++ net/ipv6/esp6_offload.c | 3 +++ net/xfrm/xfrm_device.c | 15 ++++++++++++++- net/xfrm/xfrm_replay.c | 2 +- 4 files changed, 21 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 170152772d33..3969fa805679 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -314,6 +314,9 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_ xo->seq.low += skb_shinfo(skb)->gso_segs; } + if (xo->seq.low < seq) + xo->seq.hi++; + esp.seqno = cpu_to_be64(seq + ((u64)xo->seq.hi << 32)); ip_hdr(skb)->tot_len = htons(skb->len); diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 79d43548279c..242f4295940e 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -346,6 +346,9 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features xo->seq.low += skb_shinfo(skb)->gso_segs; } + if (xo->seq.low < seq) + xo->seq.hi++; + esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); len = skb->len - sizeof(struct ipv6hdr); diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 5f5aafd418af..21269e8f2db4 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -97,6 +97,18 @@ static void xfrm_outer_mode_prep(struct xfrm_state *x, struct sk_buff *skb) } } +static inline bool xmit_xfrm_check_overflow(struct sk_buff *skb) +{ + struct xfrm_offload *xo = xfrm_offload(skb); + __u32 seq = xo->seq.low; + + seq += skb_shinfo(skb)->gso_segs; + if (unlikely(seq < xo->seq.low)) + return true; + + return false; +} + struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again) { int err; @@ 
-134,7 +146,8 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur return skb; } - if (skb_is_gso(skb) && unlikely(x->xso.dev != dev)) { + if (skb_is_gso(skb) && (unlikely(x->xso.dev != dev) || + unlikely(xmit_xfrm_check_overflow(skb)))) { struct sk_buff *segs; /* Packet got rerouted, fixup features and segment it. */ diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 9f4d42eb090f..ce56d659c55a 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -714,7 +714,7 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff oseq += skb_shinfo(skb)->gso_segs; } - if (unlikely(oseq < replay_esn->oseq)) { + if (unlikely(xo->seq.low < replay_esn->oseq)) { XFRM_SKB_CB(skb)->seq.output.hi = ++oseq_hi; xo->seq.hi = oseq_hi; replay_esn->oseq_hi = oseq_hi; -- cgit v1.2.3 From 03c0ad4b06c3566de624b4f4b78ac1a5d1e4c8e7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 13 Oct 2022 19:41:51 +0200 Subject: wifi: cfg80211: silence a sparse RCU warning All we're going to do with this pointer is assign it to another __rcu pointer, but sparse can't see that, so use rcu_access_pointer() to silence the warning here. Fixes: c90b93b5b782 ("wifi: cfg80211: update hidden BSSes to avoid WARN_ON") Signed-off-by: Johannes Berg --- net/wireless/scan.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 806a5f1330ff..da752b0cc752 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1674,7 +1674,9 @@ cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, if (old == rcu_access_pointer(known->pub.ies)) rcu_assign_pointer(known->pub.ies, new->pub.beacon_ies); - cfg80211_update_hidden_bsses(known, new->pub.beacon_ies, old); + cfg80211_update_hidden_bsses(known, + rcu_access_pointer(new->pub.beacon_ies), + old); if (old) kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head); -- cgit v1.2.3 From 50b2e8711462409cd368c41067405aa446dfa2af Mon Sep 17 00:00:00 2001 From: taozhang Date: Sat, 15 Oct 2022 17:38:31 +0800 Subject: wifi: mac80211: fix memory free error when registering wiphy fail ieee80211_register_hw free the allocated cipher suites when registering wiphy fail, and ieee80211_free_hw will re-free it. set wiphy_ciphers_allocated to false after freeing allocated cipher suites. 
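As an illustrative sketch of the error-handling pattern this fix applies (hypothetical names, plain userspace C, not mac80211 code): when an error path frees a resource that the final teardown routine also frees, the ownership flag has to be cleared at the same time, otherwise teardown frees the same pointer again.

#include <stdbool.h>
#include <stdlib.h>

/* Hypothetical example only. */
struct dev_state {
	bool ciphers_allocated;		/* true only while 'ciphers' is owned here */
	unsigned int *ciphers;
};

/* Registration error path: free the buffer AND clear the flag, otherwise a
 * later dev_free() would free the same pointer a second time. */
static void dev_register_failed(struct dev_state *st)
{
	if (st->ciphers_allocated) {
		free(st->ciphers);
		st->ciphers = NULL;
		st->ciphers_allocated = false;
	}
}

/* Final teardown: with the flag cleared above, this becomes a no-op after a
 * failed registration instead of a double free. */
static void dev_free(struct dev_state *st)
{
	if (st->ciphers_allocated) {
		free(st->ciphers);
		st->ciphers = NULL;
		st->ciphers_allocated = false;
	}
}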
Signed-off-by: taozhang Signed-off-by: Johannes Berg --- net/mac80211/main.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 46f3eddc2388..02b5abc7326b 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1439,8 +1439,10 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) ieee80211_led_exit(local); destroy_workqueue(local->workqueue); fail_workqueue: - if (local->wiphy_ciphers_allocated) + if (local->wiphy_ciphers_allocated) { kfree(local->hw.wiphy->cipher_suites); + local->wiphy_ciphers_allocated = false; + } kfree(local->int_scan_req); return result; } @@ -1508,8 +1510,10 @@ void ieee80211_free_hw(struct ieee80211_hw *hw) mutex_destroy(&local->iflist_mtx); mutex_destroy(&local->mtx); - if (local->wiphy_ciphers_allocated) + if (local->wiphy_ciphers_allocated) { kfree(local->hw.wiphy->cipher_suites); + local->wiphy_ciphers_allocated = false; + } idr_for_each(&local->ack_status_frames, ieee80211_free_ack_frame, NULL); -- cgit v1.2.3 From 57b962e627ec0ae53d4d16d7bd1033e27e67677a Mon Sep 17 00:00:00 2001 From: Arend van Spriel Date: Thu, 20 Oct 2022 13:40:40 +0200 Subject: wifi: cfg80211: fix memory leak in query_regdb_file() In the function query_regdb_file() the alpha2 parameter is duplicated using kmemdup() and subsequently freed in regdb_fw_cb(). However, request_firmware_nowait() can fail without calling regdb_fw_cb() and thus leak memory. Fixes: 007f6c5e6eb4 ("cfg80211: support loading regulatory database as firmware file") Signed-off-by: Arend van Spriel Signed-off-by: Johannes Berg --- net/wireless/reg.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index d5c7a5aa6853..c3d950d29432 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1084,6 +1084,8 @@ MODULE_FIRMWARE("regulatory.db"); static int query_regdb_file(const char *alpha2) { + int err; + ASSERT_RTNL(); if (regdb) @@ -1093,9 +1095,13 @@ static int query_regdb_file(const char *alpha2) if (!alpha2) return -ENOMEM; - return request_firmware_nowait(THIS_MODULE, true, "regulatory.db", - ®_pdev->dev, GFP_KERNEL, - (void *)alpha2, regdb_fw_cb); + err = request_firmware_nowait(THIS_MODULE, true, "regulatory.db", + ®_pdev->dev, GFP_KERNEL, + (void *)alpha2, regdb_fw_cb); + if (err) + kfree(alpha2); + + return err; } int reg_reload_regdb(void) -- cgit v1.2.3 From 18429c51c7ff6e6bfd627316c54670230967a7e5 Mon Sep 17 00:00:00 2001 From: Paul Zhang Date: Tue, 11 Oct 2022 21:04:28 +0800 Subject: wifi: cfg80211: Fix bitrates overflow issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When invoking function cfg80211_calculate_bitrate_eht about (320 MHz, EHT-MCS 13, EHT-NSS 2, EHT-GI 0), which means the parameters as flags: 0x80, bw: 7, mcs: 13, eht_gi: 0, nss: 2, this formula (result * rate->nss) will overflow and causes the returned bitrate to be 3959 when it should be 57646. Here is the explanation: u64 tmp; u32 result; … /* tmp = result = 4 * rates_996[0] * = 4 * 480388888 = 0x72889c60 */ tmp = result; /* tmp = 0x72889c60 * 6144 = 0xabccea90000 */ tmp *= SCALE; /* tmp = 0xabccea90000 / mcs_divisors[13] * = 0xabccea90000 / 5120 = 0x8970bba6 */ do_div(tmp, mcs_divisors[rate->mcs]); /* result = 0x8970bba6 */ result = tmp; /* normally (result * rate->nss) = 0x8970bba6 * 2 = 0x112e1774c, * but since result is u32, (result * rate->nss) = 0x12e1774c, * overflow happens and it loses the highest bit. 
* Then result = 0x12e1774c / 8 = 39595753, */ result = (result * rate->nss) / 8; Signed-off-by: Paul Zhang Signed-off-by: Johannes Berg --- net/wireless/util.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/util.c b/net/wireless/util.c index 1f285b515028..39680e7bad45 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1557,10 +1557,12 @@ static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate) tmp = result; tmp *= SCALE; do_div(tmp, mcs_divisors[rate->mcs]); - result = tmp; /* and take NSS */ - result = (result * rate->nss) / 8; + tmp *= rate->nss; + do_div(tmp, 8); + + result = tmp; return result / 10000; } -- cgit v1.2.3 From 7f57f8165cb6d2c206e2b9ada53b9e2d6d8af42f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 25 Oct 2022 14:06:48 +0800 Subject: af_key: Fix send_acquire race with pfkey_register The function pfkey_send_acquire may race with pfkey_register (which could even be in a different name space). This may result in a buffer overrun. Allocating the maximum amount of memory that could be used prevents this. Reported-by: syzbot+1e9af9185d8850e2c2fa@syzkaller.appspotmail.com Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Herbert Xu Reviewed-by: Sabrina Dubroca Reviewed-by: Eric Dumazet Signed-off-by: Steffen Klassert --- net/key/af_key.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/key/af_key.c b/net/key/af_key.c index c85df5b958d2..213287814328 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -2905,7 +2905,7 @@ static int count_ah_combs(const struct xfrm_tmpl *t) break; if (!aalg->pfkey_supported) continue; - if (aalg_tmpl_set(t, aalg) && aalg->available) + if (aalg_tmpl_set(t, aalg)) sz += sizeof(struct sadb_comb); } return sz + sizeof(struct sadb_prop); @@ -2923,7 +2923,7 @@ static int count_esp_combs(const struct xfrm_tmpl *t) if (!ealg->pfkey_supported) continue; - if (!(ealg_tmpl_set(t, ealg) && ealg->available)) + if (!(ealg_tmpl_set(t, ealg))) continue; for (k = 1; ; k++) { @@ -2934,16 +2934,17 @@ static int count_esp_combs(const struct xfrm_tmpl *t) if (!aalg->pfkey_supported) continue; - if (aalg_tmpl_set(t, aalg) && aalg->available) + if (aalg_tmpl_set(t, aalg)) sz += sizeof(struct sadb_comb); } } return sz + sizeof(struct sadb_prop); } -static void dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) +static int dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) { struct sadb_prop *p; + int sz = 0; int i; p = skb_put(skb, sizeof(struct sadb_prop)); @@ -2971,13 +2972,17 @@ static void dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) c->sadb_comb_soft_addtime = 20*60*60; c->sadb_comb_hard_usetime = 8*60*60; c->sadb_comb_soft_usetime = 7*60*60; + sz += sizeof(*c); } } + + return sz + sizeof(*p); } -static void dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) +static int dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) { struct sadb_prop *p; + int sz = 0; int i, k; p = skb_put(skb, sizeof(struct sadb_prop)); @@ -3019,8 +3024,11 @@ static void dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) c->sadb_comb_soft_addtime = 20*60*60; c->sadb_comb_hard_usetime = 8*60*60; c->sadb_comb_soft_usetime = 7*60*60; + sz += sizeof(*c); } } + + return sz + sizeof(*p); } static int key_notify_policy_expire(struct xfrm_policy *xp, const struct km_event *c) @@ -3150,6 +3158,7 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl 
*t, struct struct sadb_x_sec_ctx *sec_ctx; struct xfrm_sec_ctx *xfrm_ctx; int ctx_size = 0; + int alg_size = 0; sockaddr_size = pfkey_sockaddr_size(x->props.family); if (!sockaddr_size) @@ -3161,16 +3170,16 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct sizeof(struct sadb_x_policy); if (x->id.proto == IPPROTO_AH) - size += count_ah_combs(t); + alg_size = count_ah_combs(t); else if (x->id.proto == IPPROTO_ESP) - size += count_esp_combs(t); + alg_size = count_esp_combs(t); if ((xfrm_ctx = x->security)) { ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len); size += sizeof(struct sadb_x_sec_ctx) + ctx_size; } - skb = alloc_skb(size + 16, GFP_ATOMIC); + skb = alloc_skb(size + alg_size + 16, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; @@ -3224,10 +3233,13 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct pol->sadb_x_policy_priority = xp->priority; /* Set sadb_comb's. */ + alg_size = 0; if (x->id.proto == IPPROTO_AH) - dump_ah_combs(skb, t); + alg_size = dump_ah_combs(skb, t); else if (x->id.proto == IPPROTO_ESP) - dump_esp_combs(skb, t); + alg_size = dump_esp_combs(skb, t); + + hdr->sadb_msg_len += alg_size / 8; /* security context */ if (xfrm_ctx) { -- cgit v1.2.3 From cbdeaee94a415800c65a8c3fa04d9664a8b8fb3a Mon Sep 17 00:00:00 2001 From: Zhang Xiaoxu Date: Thu, 20 Oct 2022 11:42:17 +0800 Subject: SUNRPC: Fix null-ptr-deref when xps sysfs alloc failed There is a null-ptr-deref when xps sysfs alloc failed: BUG: KASAN: null-ptr-deref in sysfs_do_create_link_sd+0x40/0xd0 Read of size 8 at addr 0000000000000030 by task gssproxy/457 CPU: 5 PID: 457 Comm: gssproxy Not tainted 6.0.0-09040-g02357b27ee03 #9 Call Trace: dump_stack_lvl+0x34/0x44 kasan_report+0xa3/0x120 sysfs_do_create_link_sd+0x40/0xd0 rpc_sysfs_client_setup+0x161/0x1b0 rpc_new_client+0x3fc/0x6e0 rpc_create_xprt+0x71/0x220 rpc_create+0x1d4/0x350 gssp_rpc_create+0xc3/0x160 set_gssp_clnt+0xbc/0x140 write_gssp+0x116/0x1a0 proc_reg_write+0xd6/0x130 vfs_write+0x177/0x690 ksys_write+0xb9/0x150 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 When the xprt_switch sysfs alloc failed, should not add xprt and switch sysfs to it, otherwise, maybe null-ptr-deref; also initialize the 'xps_sysfs' to NULL to avoid oops when destroy it. 
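A rough illustration of the shape of this fix (hypothetical names and plain userspace C, not the SUNRPC code): record a failed optional allocation as an explicit NULL, and have dependent setup paths check it before building anything on top of it.

#include <stdlib.h>

/* Hypothetical sketch only. 'sysfs' stands in for an optional sub-object
 * whose allocation is allowed to fail. */
struct xprt_switch_like {
	void *sysfs;		/* NULL when the sysfs object could not be created */
};

static void switch_sysfs_setup(struct xprt_switch_like *xps)
{
	void *obj = malloc(64);		/* stands in for the kobject allocation */

	if (obj)
		xps->sysfs = obj;
	else
		xps->sysfs = NULL;	/* record the failure instead of leaving the field stale */
}

static int client_sysfs_setup(struct xprt_switch_like *xps)
{
	/* Check the optional parent before allocating anything that would
	 * have to be linked under it. */
	if (!xps->sysfs)
		return -1;

	/* ... create and link the per-client entry here ... */
	return 0;
}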
Fixes: 2a338a543163 ("sunrpc: add a symlink from rpc-client directory to the xprt_switch") Fixes: d408ebe04ac5 ("sunrpc: add add sysfs directory per xprt under each xprt_switch") Fixes: baea99445dd4 ("sunrpc: add xprt_switch direcotry to sunrpc's sysfs") Signed-off-by: Zhang Xiaoxu Signed-off-by: Anna Schumaker --- net/sunrpc/sysfs.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index c65c90ad626a..c1f559892ae8 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -518,13 +518,16 @@ void rpc_sysfs_client_setup(struct rpc_clnt *clnt, struct net *net) { struct rpc_sysfs_client *rpc_client; + struct rpc_sysfs_xprt_switch *xswitch = + (struct rpc_sysfs_xprt_switch *)xprt_switch->xps_sysfs; + + if (!xswitch) + return; rpc_client = rpc_sysfs_client_alloc(rpc_sunrpc_client_kobj, net, clnt->cl_clid); if (rpc_client) { char name[] = "switch"; - struct rpc_sysfs_xprt_switch *xswitch = - (struct rpc_sysfs_xprt_switch *)xprt_switch->xps_sysfs; int ret; clnt->cl_sysfs = rpc_client; @@ -558,6 +561,8 @@ void rpc_sysfs_xprt_switch_setup(struct rpc_xprt_switch *xprt_switch, rpc_xprt_switch->xprt_switch = xprt_switch; rpc_xprt_switch->xprt = xprt; kobject_uevent(&rpc_xprt_switch->kobject, KOBJ_ADD); + } else { + xprt_switch->xps_sysfs = NULL; } } @@ -569,6 +574,9 @@ void rpc_sysfs_xprt_setup(struct rpc_xprt_switch *xprt_switch, struct rpc_sysfs_xprt_switch *switch_obj = (struct rpc_sysfs_xprt_switch *)xprt_switch->xps_sysfs; + if (!switch_obj) + return; + rpc_xprt = rpc_sysfs_xprt_alloc(&switch_obj->kobject, xprt, gfp_flags); if (rpc_xprt) { xprt->xprt_sysfs = rpc_xprt; -- cgit v1.2.3 From 8a0fa3ff3b606b55c4edc71ad133e61529b64549 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 8 Oct 2022 14:58:29 -0400 Subject: SUNRPC: Fix crasher in gss_unwrap_resp_integ() If a zero length is passed to kmalloc() it returns 0x10, which is not a valid address. gss_unwrap_resp_integ() subsequently crashes when it attempts to dereference that pointer. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/auth_gss/auth_gss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index a31a27816cc0..7bb247c51e2f 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1989,7 +1989,7 @@ gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred, goto unwrap_failed; mic.len = len; mic.data = kmalloc(len, GFP_KERNEL); - if (!mic.data) + if (ZERO_OR_NULL_PTR(mic.data)) goto unwrap_failed; if (read_bytes_from_xdr_buf(rcv_buf, offset, mic.data, mic.len)) goto unwrap_failed; -- cgit v1.2.3 From e4ba4554209f626c52e2e57f26cba49a62663c8b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 27 Oct 2022 20:25:01 -0700 Subject: net: openvswitch: add missing .resv_start_op I missed one of the families in OvS when annotating .resv_start_op. This triggers the warning added in commit ce48ebdd5651 ("genetlink: limit the use of validation workarounds to old ops"). 
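For context on the field being added below: since the reserved-header-byte validation work, generic netlink families are expected to set .resv_start_op, conventionally to their highest existing command plus one, so that the strict checks only apply from that command number onward. A minimal, hypothetical family definition might look like the following sketch (all FOO_* names and handlers are invented; only the .resv_start_op line is the point).

#include <linux/module.h>
#include <net/genetlink.h>

enum {
	FOO_ATTR_UNSPEC,
	FOO_ATTR_VALUE,
	__FOO_ATTR_MAX,
};
#define FOO_ATTR_MAX (__FOO_ATTR_MAX - 1)

enum {
	FOO_CMD_UNSPEC,
	FOO_CMD_GET,
	FOO_CMD_SET,		/* highest command the family defines today */
};

/* Trivial stub handlers, just to keep the sketch self-contained. */
static int foo_get_doit(struct sk_buff *skb, struct genl_info *info) { return 0; }
static int foo_set_doit(struct sk_buff *skb, struct genl_info *info) { return 0; }

static const struct genl_small_ops foo_genl_ops[] = {
	{ .cmd = FOO_CMD_GET, .doit = foo_get_doit },
	{ .cmd = FOO_CMD_SET, .doit = foo_set_doit },
};

static struct genl_family foo_genl_family __ro_after_init = {
	.name		= "foo",
	.version	= 1,
	.maxattr	= FOO_ATTR_MAX,
	.small_ops	= foo_genl_ops,
	.n_small_ops	= ARRAY_SIZE(foo_genl_ops),
	/* Reserved genetlink header bytes are strictly validated starting
	 * from this command number; convention is "last command + 1". */
	.resv_start_op	= FOO_CMD_SET + 1,
	.module		= THIS_MODULE,
};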
Reported-by: syzbot+40eb8c0447c0e47a7e9b@syzkaller.appspotmail.com Fixes: 9c5d03d36251 ("genetlink: start to validate reserved header bytes") Link: https://lore.kernel.org/r/20221028032501.2724270-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/openvswitch/datapath.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 155263e73512..8b84869eb2ac 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -2544,6 +2544,7 @@ struct genl_family dp_vport_genl_family __ro_after_init = { .parallel_ops = true, .small_ops = dp_vport_genl_ops, .n_small_ops = ARRAY_SIZE(dp_vport_genl_ops), + .resv_start_op = OVS_VPORT_CMD_SET + 1, .mcgrps = &ovs_dp_vport_multicast_group, .n_mcgrps = 1, .module = THIS_MODULE, -- cgit v1.2.3 From 8f279fb00bb29def9ac79e28c5d6d8e07d21f3fb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 27 Oct 2022 00:25:56 +0100 Subject: udp: advertise ipv6 udp support for msghdr::ubuf_info Mark udp ipv6 as supporting msghdr::ubuf_info. In the original commit SOCK_SUPPORT_ZC was supposed to be set by a udp_init_sock() call from udp6_init_sock(), but d38afeec26ed4 ("tcp/udp: Call inet6_destroy_sock() in IPv6 ...") removed it and so ipv6 udp misses the flag. Cc: # 6.0 Fixes: e993ffe3da4bc ("net: flag sockets supporting msghdr originated zerocopy") Signed-off-by: Pavel Begunkov Signed-off-by: Jakub Kicinski --- net/ipv6/udp.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 129ec5a9b0eb..bc65e5b7195b 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -66,6 +66,7 @@ int udpv6_init_sock(struct sock *sk) { skb_queue_head_init(&udp_sk(sk)->reader_queue); sk->sk_destruct = udpv6_destruct_sock; + set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); return 0; } -- cgit v1.2.3 From fee9ac06647e59a69fb7aec58f25267c134264b4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 27 Oct 2022 00:25:57 +0100 Subject: net: remove SOCK_SUPPORT_ZC from sockmap sockmap replaces ->sk_prot with its own callbacks, we should remove SOCK_SUPPORT_ZC as the new proto doesn't support msghdr::ubuf_info. 
Cc: # 6.0 Reported-by: Jakub Kicinski Fixes: e993ffe3da4bc ("net: flag sockets supporting msghdr originated zerocopy") Signed-off-by: Pavel Begunkov Signed-off-by: Jakub Kicinski --- include/net/sock.h | 7 +++++++ net/ipv4/tcp_bpf.c | 4 ++-- net/ipv4/udp_bpf.c | 4 ++-- net/unix/unix_bpf.c | 8 ++++---- 4 files changed, 15 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 22f8bab583dd..5db02546941c 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1889,6 +1889,13 @@ void sock_kfree_s(struct sock *sk, void *mem, int size); void sock_kzfree_s(struct sock *sk, void *mem, int size); void sk_send_sigurg(struct sock *sk); +static inline void sock_replace_proto(struct sock *sk, struct proto *proto) +{ + if (sk->sk_socket) + clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); + WRITE_ONCE(sk->sk_prot, proto); +} + struct sockcm_cookie { u64 transmit_time; u32 mark; diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index a1626afe87a1..c501c329b1db 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -607,7 +607,7 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) } else { sk->sk_write_space = psock->saved_write_space; /* Pairs with lockless read in sk_clone_lock() */ - WRITE_ONCE(sk->sk_prot, psock->sk_proto); + sock_replace_proto(sk, psock->sk_proto); } return 0; } @@ -620,7 +620,7 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) } /* Pairs with lockless read in sk_clone_lock() */ - WRITE_ONCE(sk->sk_prot, &tcp_bpf_prots[family][config]); + sock_replace_proto(sk, &tcp_bpf_prots[family][config]); return 0; } EXPORT_SYMBOL_GPL(tcp_bpf_update_proto); diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index ff15918b7bdc..e5dc91d0e079 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -141,14 +141,14 @@ int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) if (restore) { sk->sk_write_space = psock->saved_write_space; - WRITE_ONCE(sk->sk_prot, psock->sk_proto); + sock_replace_proto(sk, psock->sk_proto); return 0; } if (sk->sk_family == AF_INET6) udp_bpf_check_v6_needs_rebuild(psock->sk_proto); - WRITE_ONCE(sk->sk_prot, &udp_bpf_prots[family]); + sock_replace_proto(sk, &udp_bpf_prots[family]); return 0; } EXPORT_SYMBOL_GPL(udp_bpf_update_proto); diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c index 7cf14c6b1725..e9bf15513961 100644 --- a/net/unix/unix_bpf.c +++ b/net/unix/unix_bpf.c @@ -145,12 +145,12 @@ int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool re if (restore) { sk->sk_write_space = psock->saved_write_space; - WRITE_ONCE(sk->sk_prot, psock->sk_proto); + sock_replace_proto(sk, psock->sk_proto); return 0; } unix_dgram_bpf_check_needs_rebuild(psock->sk_proto); - WRITE_ONCE(sk->sk_prot, &unix_dgram_bpf_prot); + sock_replace_proto(sk, &unix_dgram_bpf_prot); return 0; } @@ -158,12 +158,12 @@ int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool r { if (restore) { sk->sk_write_space = psock->saved_write_space; - WRITE_ONCE(sk->sk_prot, psock->sk_proto); + sock_replace_proto(sk, psock->sk_proto); return 0; } unix_stream_bpf_check_needs_rebuild(psock->sk_proto); - WRITE_ONCE(sk->sk_prot, &unix_stream_bpf_prot); + sock_replace_proto(sk, &unix_stream_bpf_prot); return 0; } -- cgit v1.2.3 From e276d62dcfdee6582486e8b8344dd869518e14be Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 27 Oct 2022 00:25:58 +0100 Subject: net/ulp: remove SOCK_SUPPORT_ZC from tls sockets 
Remove SOCK_SUPPORT_ZC when we're setting ulp as it might not support msghdr::ubuf_info, e.g. like TLS replacing ->sk_prot with a new set of handlers. Cc: # 6.0 Reported-by: Jakub Kicinski Fixes: e993ffe3da4bc ("net: flag sockets supporting msghdr originated zerocopy") Signed-off-by: Pavel Begunkov Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ulp.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index 7c27aa629af1..9ae50b1bd844 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -136,6 +136,9 @@ static int __tcp_set_ulp(struct sock *sk, const struct tcp_ulp_ops *ulp_ops) if (icsk->icsk_ulp_ops) goto out_err; + if (sk->sk_socket) + clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); + err = ulp_ops->init(sk); if (err) goto out_err; -- cgit v1.2.3 From 71b7786ea478f3c4611deff4d2b9676b0c17c56b Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 27 Oct 2022 00:25:59 +0100 Subject: net: also flag accepted sockets supporting msghdr originated zerocopy Without this only the client initiated tcp sockets have SOCK_SUPPORT_ZC. The listening socket on the server also has it, but the accepted connections didn't, which meant IORING_OP_SEND[MSG]_ZC will always fails with -EOPNOTSUPP. Fixes: e993ffe3da4b ("net: flag sockets supporting msghdr originated zerocopy") Cc: # 6.0 CC: Jens Axboe Link: https://lore.kernel.org/io-uring/20221024141503.22b4e251@kernel.org/T/#m38aa19b0b825758fb97860a38ad13122051f9dda Signed-off-by: Stefan Metzmacher Signed-off-by: Pavel Begunkov Signed-off-by: Jakub Kicinski --- net/ipv4/af_inet.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 3dd02396517d..4728087c42a5 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -754,6 +754,8 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags, (TCPF_ESTABLISHED | TCPF_SYN_RECV | TCPF_CLOSE_WAIT | TCPF_CLOSE))); + if (test_bit(SOCK_SUPPORT_ZC, &sock->flags)) + set_bit(SOCK_SUPPORT_ZC, &newsock->flags); sock_graft(sk2, newsock); newsock->state = SS_CONNECTED; -- cgit v1.2.3 From a2c65a9d0568b6737c02b54f00b80716a53fac61 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 27 Oct 2022 17:54:39 +0300 Subject: net: dsa: fall back to default tagger if we can't load the one from DT DSA tagging protocol drivers can be changed at runtime through sysfs and at probe time through the device tree (support for the latter was added later). When changing through sysfs, it is assumed that the module for the new tagging protocol was already loaded into the kernel (in fact this is only a concern for Ocelot/Felix switches, where we have tag_ocelot.ko and tag_ocelot_8021q.ko; for every other switch, the default and alternative protocols are compiled within the same .ko, so there is nothing for the user to load). The kernel cannot currently call request_module(), because it has no way of constructing the modalias name of the tagging protocol driver ("dsa_tag-%d", where the number is one of DSA_TAG_PROTO_*_VALUE). The device tree only contains the string name of the tagging protocol ("ocelot-8021q"), and the only mapping between the string and the DSA_TAG_PROTO_OCELOT_8021Q_VALUE is present in tag_ocelot_8021q.ko. So this is a chicken-and-egg situation and dsa_core.ko has nothing based on which it can automatically request the insertion of the module. As a consequence, if CONFIG_NET_DSA_TAG_OCELOT_8021Q is built as module, the switch will forever defer probing. 
The long-term solution is to make DSA call request_module() somehow, but that probably needs some refactoring. What we can do to keep operating with existing device tree blobs is to cancel the attempt to change the tagging protocol with the one specified there, and to remain operating with the default one. Depending on the situation, the default protocol might still allow some functionality (in the case of ocelot, it does), and it's better to have that than to fail to probe. Fixes: deff710703d8 ("net: dsa: Allow default tag protocol to be overridden from DT") Link: https://lore.kernel.org/lkml/20221027113248.420216-1-michael@walle.cc/ Reported-by: Heiko Thiery Reported-by: Michael Walle Signed-off-by: Vladimir Oltean Tested-by: Michael Walle Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20221027145439.3086017-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/dsa2.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index af0e2c0394ac..e504a18fc125 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -1409,9 +1409,9 @@ static enum dsa_tag_protocol dsa_get_tag_protocol(struct dsa_port *dp, static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master, const char *user_protocol) { + const struct dsa_device_ops *tag_ops = NULL; struct dsa_switch *ds = dp->ds; struct dsa_switch_tree *dst = ds->dst; - const struct dsa_device_ops *tag_ops; enum dsa_tag_protocol default_proto; /* Find out which protocol the switch would prefer. */ @@ -1434,10 +1434,17 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master, } tag_ops = dsa_find_tagger_by_name(user_protocol); - } else { - tag_ops = dsa_tag_driver_get(default_proto); + if (IS_ERR(tag_ops)) { + dev_warn(ds->dev, + "Failed to find a tagging driver for protocol %s, using default\n", + user_protocol); + tag_ops = NULL; + } } + if (!tag_ops) + tag_ops = dsa_tag_driver_get(default_proto); + if (IS_ERR(tag_ops)) { if (PTR_ERR(tag_ops) == -ENOPROTOOPT) return -EPROBE_DEFER; -- cgit v1.2.3 From 8bdc2acd420c6f3dd1f1c78750ec989f02a1e2b9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 28 Oct 2022 18:05:00 +0300 Subject: net: sched: Fix use after free in red_enqueue() We can't use "skb" again after passing it to qdisc_enqueue(). This is basically identical to commit 2f09707d0c97 ("sch_sfb: Also store skb len before calling child enqueue"). Fixes: d7f4f332f082 ("sch_red: update backlog as well") Signed-off-by: Dan Carpenter Reviewed-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/sched/sch_red.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index a5a401f93c1a..98129324e157 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -72,6 +72,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, { struct red_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; + unsigned int len; int ret; q->vars.qavg = red_calc_qavg(&q->parms, @@ -126,9 +127,10 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, break; } + len = qdisc_pkt_len(skb); ret = qdisc_enqueue(skb, child, to_free); if (likely(ret == NET_XMIT_SUCCESS)) { - qdisc_qstats_backlog_inc(sch, skb); + sch->qstats.backlog += len; sch->q.qlen++; } else if (net_xmit_drop_count(ret)) { q->stats.pdrop++; -- cgit v1.2.3 From d4bc8271db21ea9f1c86a1ca4d64999f184d4aae Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 26 Oct 2022 09:52:36 +0200 Subject: netfilter: nf_tables: netlink notifier might race to release objects commit release path is invoked via call_rcu and it runs lockless to release the objects after rcu grace period. The netlink notifier handler might win race to remove objects that the transaction context is still referencing from the commit release path. Call rcu_barrier() to ensure pending rcu callbacks run to completion if the list of transactions to be destroyed is not empty. Fixes: 6001a930ce03 ("netfilter: nftables: introduce table ownership") Reported-by: syzbot+8f747f62763bc6c32916@syzkaller.appspotmail.com Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 58d9cbc9ccdc..2197118aa7b0 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -10030,6 +10030,8 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, nft_net = nft_pernet(net); deleted = 0; mutex_lock(&nft_net->commit_mutex); + if (!list_empty(&nf_tables_destroy_list)) + rcu_barrier(); again: list_for_each_entry(table, &nft_net->tables, list) { if (nft_table_has_owner(table) && -- cgit v1.2.3 From 26b5934ff4194e13196bedcba373cd4915071d0e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 26 Oct 2022 09:54:45 +0200 Subject: netfilter: nf_tables: release flow rule object from commit path No need to postpone this to the commit release path, since no packets are walking over this object, this is accessed from control plane only. This helped uncovered UAF triggered by races with the netlink notifier. 
Fixes: 9dd732e0bdf5 ("netfilter: nf_tables: memleak flow rule from commit path") Reported-by: syzbot+8f747f62763bc6c32916@syzkaller.appspotmail.com Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2197118aa7b0..76bd4d03dbda 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8465,9 +8465,6 @@ static void nft_commit_release(struct nft_trans *trans) nf_tables_chain_destroy(&trans->ctx); break; case NFT_MSG_DELRULE: - if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) - nft_flow_rule_destroy(nft_trans_flow_rule(trans)); - nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); break; case NFT_MSG_DELSET: @@ -8973,6 +8970,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_rule_expr_deactivate(&trans->ctx, nft_trans_rule(trans), NFT_TRANS_COMMIT); + + if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) + nft_flow_rule_destroy(nft_trans_flow_rule(trans)); break; case NFT_MSG_NEWSET: nft_clear(net, nft_trans_set(trans)); -- cgit v1.2.3 From 8ec95b94716a1e4d126edc3fb2bc426a717e2dba Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Tue, 1 Nov 2022 09:31:36 +0800 Subject: bpf, sockmap: Fix the sk->sk_forward_alloc warning of sk_stream_kill_queues When running `test_sockmap` selftests, the following warning appears: WARNING: CPU: 2 PID: 197 at net/core/stream.c:205 sk_stream_kill_queues+0xd3/0xf0 Call Trace: inet_csk_destroy_sock+0x55/0x110 tcp_rcv_state_process+0xd28/0x1380 ? tcp_v4_do_rcv+0x77/0x2c0 tcp_v4_do_rcv+0x77/0x2c0 __release_sock+0x106/0x130 __tcp_close+0x1a7/0x4e0 tcp_close+0x20/0x70 inet_release+0x3c/0x80 __sock_release+0x3a/0xb0 sock_close+0x14/0x20 __fput+0xa3/0x260 task_work_run+0x59/0xb0 exit_to_user_mode_prepare+0x1b3/0x1c0 syscall_exit_to_user_mode+0x19/0x50 do_syscall_64+0x48/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae The root case is in commit 84472b436e76 ("bpf, sockmap: Fix more uncharged while msg has more_data"), where I used msg->sg.size to replace the tosend, causing breakage: if (msg->apply_bytes && msg->apply_bytes < tosend) tosend = psock->apply_bytes; Fixes: 84472b436e76 ("bpf, sockmap: Fix more uncharged while msg has more_data") Reported-by: Jakub Sitnicki Signed-off-by: Wang Yufen Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/1667266296-8794-1-git-send-email-wangyufen@huawei.com --- net/ipv4/tcp_bpf.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index c501c329b1db..cf9c3e8f7ccb 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -278,7 +278,7 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, { bool cork = false, enospc = sk_msg_full(msg); struct sock *sk_redir; - u32 tosend, delta = 0; + u32 tosend, origsize, sent, delta = 0; u32 eval = __SK_NONE; int ret; @@ -333,10 +333,12 @@ more_data: cork = true; psock->cork = NULL; } - sk_msg_return(sk, msg, msg->sg.size); + sk_msg_return(sk, msg, tosend); release_sock(sk); + origsize = msg->sg.size; ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags); + sent = origsize - msg->sg.size; if (eval == __SK_REDIRECT) sock_put(sk_redir); @@ -375,7 +377,7 @@ more_data: msg->sg.data[msg->sg.start].page_link && msg->sg.data[msg->sg.start].length) { if (eval == __SK_REDIRECT) - sk_mem_charge(sk, msg->sg.size); 
+ sk_mem_charge(sk, tosend - sent); goto more_data; } } -- cgit v1.2.3 From ecaf75ffd5f5db320d8b1da0198eef5a5ce64a3f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 31 Oct 2022 13:34:07 +0100 Subject: netlink: introduce bigendian integer types Jakub reported that the addition of the "network_byte_order" member in struct nla_policy increases size of 32bit platforms. Instead of scraping the bit from elsewhere Johannes suggested to add explicit NLA_BE types instead, so do this here. NLA_POLICY_MAX_BE() macro is removed again, there is no need for it: NLA_POLICY_MAX(NLA_BE.., ..) will do the right thing. NLA_BE64 can be added later. Fixes: 08724ef69907 ("netlink: introduce NLA_POLICY_MAX_BE") Reported-by: Jakub Kicinski Suggested-by: Johannes Berg Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20221031123407.9158-1-fw@strlen.de Signed-off-by: Jakub Kicinski --- include/net/netlink.h | 19 ++++++++++--------- lib/nlattr.c | 41 +++++++++++++++-------------------------- net/netfilter/nft_payload.c | 6 +++--- 3 files changed, 28 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/include/net/netlink.h b/include/net/netlink.h index 7db13b3261fc..6bfa972f2fbf 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -181,6 +181,8 @@ enum { NLA_S64, NLA_BITFIELD32, NLA_REJECT, + NLA_BE16, + NLA_BE32, __NLA_TYPE_MAX, }; @@ -231,6 +233,7 @@ enum nla_policy_validation { * NLA_U32, NLA_U64, * NLA_S8, NLA_S16, * NLA_S32, NLA_S64, + * NLA_BE16, NLA_BE32, * NLA_MSECS Leaving the length field zero will verify the * given type fits, using it verifies minimum length * just like "All other" @@ -261,6 +264,8 @@ enum nla_policy_validation { * NLA_U16, * NLA_U32, * NLA_U64, + * NLA_BE16, + * NLA_BE32, * NLA_S8, * NLA_S16, * NLA_S32, @@ -349,7 +354,6 @@ struct nla_policy { struct netlink_range_validation_signed *range_signed; struct { s16 min, max; - u8 network_byte_order:1; }; int (*validate)(const struct nlattr *attr, struct netlink_ext_ack *extack); @@ -374,6 +378,8 @@ struct nla_policy { (tp == NLA_U8 || tp == NLA_U16 || tp == NLA_U32 || tp == NLA_U64) #define __NLA_IS_SINT_TYPE(tp) \ (tp == NLA_S8 || tp == NLA_S16 || tp == NLA_S32 || tp == NLA_S64) +#define __NLA_IS_BEINT_TYPE(tp) \ + (tp == NLA_BE16 || tp == NLA_BE32) #define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition)) #define NLA_ENSURE_UINT_TYPE(tp) \ @@ -387,6 +393,7 @@ struct nla_policy { #define NLA_ENSURE_INT_OR_BINARY_TYPE(tp) \ (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) || \ __NLA_IS_SINT_TYPE(tp) || \ + __NLA_IS_BEINT_TYPE(tp) || \ tp == NLA_MSECS || \ tp == NLA_BINARY) + tp) #define NLA_ENSURE_NO_VALIDATION_PTR(tp) \ @@ -394,6 +401,8 @@ struct nla_policy { tp != NLA_REJECT && \ tp != NLA_NESTED && \ tp != NLA_NESTED_ARRAY) + tp) +#define NLA_ENSURE_BEINT_TYPE(tp) \ + (__NLA_ENSURE(__NLA_IS_BEINT_TYPE(tp)) + tp) #define NLA_POLICY_RANGE(tp, _min, _max) { \ .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \ @@ -424,14 +433,6 @@ struct nla_policy { .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_MAX, \ .max = _max, \ - .network_byte_order = 0, \ -} - -#define NLA_POLICY_MAX_BE(tp, _max) { \ - .type = NLA_ENSURE_UINT_TYPE(tp), \ - .validation_type = NLA_VALIDATE_MAX, \ - .max = _max, \ - .network_byte_order = 1, \ } #define NLA_POLICY_MASK(tp, _mask) { \ diff --git a/lib/nlattr.c b/lib/nlattr.c index 40f22b177d69..b67a53e29b8f 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -124,10 +124,12 @@ void nla_get_range_unsigned(const struct nla_policy *pt, range->max = U8_MAX; break; 
case NLA_U16: + case NLA_BE16: case NLA_BINARY: range->max = U16_MAX; break; case NLA_U32: + case NLA_BE32: range->max = U32_MAX; break; case NLA_U64: @@ -159,31 +161,6 @@ void nla_get_range_unsigned(const struct nla_policy *pt, } } -static u64 nla_get_attr_bo(const struct nla_policy *pt, - const struct nlattr *nla) -{ - switch (pt->type) { - case NLA_U16: - if (pt->network_byte_order) - return ntohs(nla_get_be16(nla)); - - return nla_get_u16(nla); - case NLA_U32: - if (pt->network_byte_order) - return ntohl(nla_get_be32(nla)); - - return nla_get_u32(nla); - case NLA_U64: - if (pt->network_byte_order) - return be64_to_cpu(nla_get_be64(nla)); - - return nla_get_u64(nla); - } - - WARN_ON_ONCE(1); - return 0; -} - static int nla_validate_range_unsigned(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack, @@ -197,9 +174,13 @@ static int nla_validate_range_unsigned(const struct nla_policy *pt, value = nla_get_u8(nla); break; case NLA_U16: + value = nla_get_u16(nla); + break; case NLA_U32: + value = nla_get_u32(nla); + break; case NLA_U64: - value = nla_get_attr_bo(pt, nla); + value = nla_get_u64(nla); break; case NLA_MSECS: value = nla_get_u64(nla); @@ -207,6 +188,12 @@ static int nla_validate_range_unsigned(const struct nla_policy *pt, case NLA_BINARY: value = nla_len(nla); break; + case NLA_BE16: + value = ntohs(nla_get_be16(nla)); + break; + case NLA_BE32: + value = ntohl(nla_get_be32(nla)); + break; default: return -EINVAL; } @@ -334,6 +321,8 @@ static int nla_validate_int_range(const struct nla_policy *pt, case NLA_U64: case NLA_MSECS: case NLA_BINARY: + case NLA_BE16: + case NLA_BE32: return nla_validate_range_unsigned(pt, nla, extack, validate); case NLA_S8: case NLA_S16: diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 088244f9d838..4edd899aeb9b 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -173,10 +173,10 @@ static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = { [NFTA_PAYLOAD_SREG] = { .type = NLA_U32 }, [NFTA_PAYLOAD_DREG] = { .type = NLA_U32 }, [NFTA_PAYLOAD_BASE] = { .type = NLA_U32 }, - [NFTA_PAYLOAD_OFFSET] = NLA_POLICY_MAX_BE(NLA_U32, 255), - [NFTA_PAYLOAD_LEN] = NLA_POLICY_MAX_BE(NLA_U32, 255), + [NFTA_PAYLOAD_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255), + [NFTA_PAYLOAD_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_PAYLOAD_CSUM_TYPE] = { .type = NLA_U32 }, - [NFTA_PAYLOAD_CSUM_OFFSET] = NLA_POLICY_MAX_BE(NLA_U32, 255), + [NFTA_PAYLOAD_CSUM_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_PAYLOAD_CSUM_FLAGS] = { .type = NLA_U32 }, }; -- cgit v1.2.3 From 5c26159c97b324dc5174a5713eafb8c855cf8106 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 26 Oct 2022 14:32:16 +0200 Subject: ipvs: use explicitly signed chars The `char` type with no explicit sign is sometimes signed and sometimes unsigned. This code will break on platforms such as arm, where char is unsigned. So mark it here as explicitly signed, so that the todrop_counter decrement and subsequent comparison is correct. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Jason A. 
Donenfeld Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_conn.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 8c04bb57dd6f..7c4866c04343 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1265,8 +1265,8 @@ static inline int todrop_entry(struct ip_vs_conn *cp) * The drop rate array needs tuning for real environments. * Called from timer bh only => no locking */ - static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; - static char todrop_counter[9] = {0}; + static const signed char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; + static signed char todrop_counter[9] = {0}; int i; /* if the conn entry hasn't lasted for 60 seconds, don't drop it. -- cgit v1.2.3 From 3d00c6a0da8ddcf75213e004765e4a42acc71d5d Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Mon, 31 Oct 2022 20:07:04 +0800 Subject: ipvs: fix WARNING in __ip_vs_cleanup_batch() During the initialization of ip_vs_conn_net_init(), if file ip_vs_conn or ip_vs_conn_sync fails to be created, the initialization is successful by default. Therefore, the ip_vs_conn or ip_vs_conn_sync file doesn't be found during the remove. The following is the stack information: name 'ip_vs_conn_sync' WARNING: CPU: 3 PID: 9 at fs/proc/generic.c:712 remove_proc_entry+0x389/0x460 Modules linked in: Workqueue: netns cleanup_net RIP: 0010:remove_proc_entry+0x389/0x460 Call Trace: __ip_vs_cleanup_batch+0x7d/0x120 ops_exit_list+0x125/0x170 cleanup_net+0x4ea/0xb00 process_one_work+0x9bf/0x1710 worker_thread+0x665/0x1080 kthread+0x2e4/0x3a0 ret_from_fork+0x1f/0x30 Fixes: 61b1ab4583e2 ("IPVS: netns, add basic init per netns.") Signed-off-by: Zhengchao Shao Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_conn.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 7c4866c04343..13534e02346c 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1447,20 +1447,36 @@ int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs) { atomic_set(&ipvs->conn_count, 0); - proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net, - &ip_vs_conn_seq_ops, sizeof(struct ip_vs_iter_state)); - proc_create_net("ip_vs_conn_sync", 0, ipvs->net->proc_net, - &ip_vs_conn_sync_seq_ops, - sizeof(struct ip_vs_iter_state)); +#ifdef CONFIG_PROC_FS + if (!proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net, + &ip_vs_conn_seq_ops, + sizeof(struct ip_vs_iter_state))) + goto err_conn; + + if (!proc_create_net("ip_vs_conn_sync", 0, ipvs->net->proc_net, + &ip_vs_conn_sync_seq_ops, + sizeof(struct ip_vs_iter_state))) + goto err_conn_sync; +#endif + return 0; + +#ifdef CONFIG_PROC_FS +err_conn_sync: + remove_proc_entry("ip_vs_conn", ipvs->net->proc_net); +err_conn: + return -ENOMEM; +#endif } void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs) { /* flush all the connection entries first */ ip_vs_conn_flush(ipvs); +#ifdef CONFIG_PROC_FS remove_proc_entry("ip_vs_conn", ipvs->net->proc_net); remove_proc_entry("ip_vs_conn_sync", ipvs->net->proc_net); +#endif } int __init ip_vs_conn_init(void) -- cgit v1.2.3 From 5663ed63adb9619c98ab7479aa4606fa9b7a548c Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Mon, 31 Oct 2022 20:07:05 +0800 Subject: ipvs: fix WARNING in ip_vs_app_net_cleanup() During the 
initialization of ip_vs_app_net_init(), if file ip_vs_app fails to be created, the initialization is successful by default. Therefore, the ip_vs_app file doesn't be found during the remove in ip_vs_app_net_cleanup(). It will cause WRNING. The following is the stack information: name 'ip_vs_app' WARNING: CPU: 1 PID: 9 at fs/proc/generic.c:712 remove_proc_entry+0x389/0x460 Modules linked in: Workqueue: netns cleanup_net RIP: 0010:remove_proc_entry+0x389/0x460 Call Trace: ops_exit_list+0x125/0x170 cleanup_net+0x4ea/0xb00 process_one_work+0x9bf/0x1710 worker_thread+0x665/0x1080 kthread+0x2e4/0x3a0 ret_from_fork+0x1f/0x30 Fixes: 457c4cbc5a3d ("[NET]: Make /proc/net per network namespace") Signed-off-by: Zhengchao Shao Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_app.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index f9b16f2b2219..fdacbc3c15be 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -599,13 +599,19 @@ static const struct seq_operations ip_vs_app_seq_ops = { int __net_init ip_vs_app_net_init(struct netns_ipvs *ipvs) { INIT_LIST_HEAD(&ipvs->app_list); - proc_create_net("ip_vs_app", 0, ipvs->net->proc_net, &ip_vs_app_seq_ops, - sizeof(struct seq_net_private)); +#ifdef CONFIG_PROC_FS + if (!proc_create_net("ip_vs_app", 0, ipvs->net->proc_net, + &ip_vs_app_seq_ops, + sizeof(struct seq_net_private))) + return -ENOMEM; +#endif return 0; } void __net_exit ip_vs_app_net_cleanup(struct netns_ipvs *ipvs) { unregister_ip_vs_app(ipvs, NULL /* all */); +#ifdef CONFIG_PROC_FS remove_proc_entry("ip_vs_app", ipvs->net->proc_net); +#endif } -- cgit v1.2.3 From 780854186946e0de2be192ee7fa5125666533b3a Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Wed, 26 Oct 2022 14:39:59 +0800 Subject: wifi: mac80211: fix general-protection-fault in ieee80211_subif_start_xmit() When device is running and the interface status is changed, the gpf issue is triggered. The problem triggering process is as follows: Thread A: Thread B ieee80211_runtime_change_iftype() process_one_work() ... ... ieee80211_do_stop() ... ... ... sdata->bss = NULL ... ... ieee80211_subif_start_xmit() ieee80211_multicast_to_unicast //!sdata->bss->multicast_to_unicast cause gpf issue When the interface status is changed, the sending queue continues to send packets. After the bss is set to NULL, the bss is accessed. As a result, this causes a general-protection-fault issue. 
The following is the stack information: general protection fault, probably for non-canonical address 0xdffffc000000002f: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000178-0x000000000000017f] Workqueue: mld mld_ifc_work RIP: 0010:ieee80211_subif_start_xmit+0x25b/0x1310 Call Trace: dev_hard_start_xmit+0x1be/0x990 __dev_queue_xmit+0x2c9a/0x3b60 ip6_finish_output2+0xf92/0x1520 ip6_finish_output+0x6af/0x11e0 ip6_output+0x1ed/0x540 mld_sendpack+0xa09/0xe70 mld_ifc_work+0x71c/0xdb0 process_one_work+0x9bf/0x1710 worker_thread+0x665/0x1080 kthread+0x2e4/0x3a0 ret_from_fork+0x1f/0x30 Fixes: f856373e2f31 ("wifi: mac80211: do not wake queues on a vif that is being stopped") Reported-by: syzbot+c6e8fca81c294fd5620a@syzkaller.appspotmail.com Signed-off-by: Zhengchao Shao Link: https://lore.kernel.org/r/20221026063959.177813-1-shaozhengchao@huawei.com Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index a364148149f9..874f2a4d831d 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -4418,6 +4418,11 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, if (likely(!is_multicast_ether_addr(eth->h_dest))) goto normal; + if (unlikely(!ieee80211_sdata_running(sdata))) { + kfree_skb(skb); + return NETDEV_TX_OK; + } + if (unlikely(ieee80211_multicast_to_unicast(skb, dev))) { struct sk_buff_head queue; -- cgit v1.2.3 From 39e7b5de9853bd92ddbfa4b14165babacd7da0ba Mon Sep 17 00:00:00 2001 From: Nicolas Cavallari Date: Thu, 27 Oct 2022 16:01:33 +0200 Subject: wifi: mac80211: Fix ack frame idr leak when mesh has no route When trying to transmit an data frame with tx_status to a destination that have no route in the mesh, then it is dropped without recrediting the ack_status_frames idr. Once it is exhausted, wpa_supplicant starts failing to do SAE with NL80211_CMD_FRAME and logs "nl80211: Frame command failed". Use ieee80211_free_txskb() instead of kfree_skb() to fix it. Signed-off-by: Nicolas Cavallari Link: https://lore.kernel.org/r/20221027140133.1504-1-nicolas.cavallari@green-communications.fr Signed-off-by: Johannes Berg --- net/mac80211/mesh_pathtbl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index acc1c299f1ae..69d5e1ec6ede 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -710,7 +710,7 @@ int mesh_path_send_to_gates(struct mesh_path *mpath) void mesh_path_discard_frame(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { - kfree_skb(skb); + ieee80211_free_txskb(&sdata->local->hw, skb); sdata->u.mesh.mshstats.dropped_frames_no_route++; } -- cgit v1.2.3 From 30ac96f7cc973bb850c718c9bbe1fdcedfbe826b Mon Sep 17 00:00:00 2001 From: Howard Hsu Date: Thu, 27 Oct 2022 09:56:53 +0800 Subject: wifi: mac80211: Set TWT Information Frame Disabled bit as 1 The TWT Information Frame Disabled bit of control field of TWT Setup frame shall be set to 1 since handling TWT Information frame is not supported by current mac80211 implementation. 
Fixes: f5a4c24e689f ("mac80211: introduce individual TWT support in AP mode") Signed-off-by: Howard Hsu Link: https://lore.kernel.org/r/20221027015653.1448-1-howard-yh.hsu@mediatek.com Signed-off-by: Johannes Berg --- net/mac80211/s1g.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/mac80211/s1g.c b/net/mac80211/s1g.c index 8ca7d45d6daa..c1f964e9991c 100644 --- a/net/mac80211/s1g.c +++ b/net/mac80211/s1g.c @@ -112,6 +112,9 @@ ieee80211_s1g_rx_twt_setup(struct ieee80211_sub_if_data *sdata, goto out; } + /* TWT Information not supported yet */ + twt->control |= IEEE80211_TWT_CONTROL_RX_DISABLED; + drv_add_twt_setup(sdata->local, sdata, &sta->sta, twt); out: ieee80211_s1g_send_twt_setup(sdata, mgmt->sa, sdata->vif.addr, twt); -- cgit v1.2.3 From cbc1dd5b659f5a2c3cba88b197b7443679bb35a0 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Tue, 1 Nov 2022 19:52:52 +0800 Subject: netfilter: nf_nat: Fix possible memory leak in nf_nat_init() In nf_nat_init(), register_nf_nat_bpf() can fail and return directly without any error handling. Then nf_nat_bysource will leak and registering of &nat_net_ops, &follow_master_nat and nf_nat_hook won't be reverted. This leaves wild ops in linkedlists and when another module tries to call register_pernet_operations() or nf_ct_helper_expectfn_register() it triggers page fault: BUG: unable to handle page fault for address: fffffbfff81b964c RIP: 0010:register_pernet_operations+0x1b9/0x5f0 Call Trace: register_pernet_subsys+0x29/0x40 ebtables_init+0x58/0x1000 [ebtables] ... Fixes: 820dc0523e05 ("net: netfilter: move bpf_ct_set_nat_info kfunc in nf_nat_bpf.c") Signed-off-by: Chen Zhongjin Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 18319a6e6806..e29e4ccb5c5a 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -1152,7 +1152,16 @@ static int __init nf_nat_init(void) WARN_ON(nf_nat_hook != NULL); RCU_INIT_POINTER(nf_nat_hook, &nat_hook); - return register_nf_nat_bpf(); + ret = register_nf_nat_bpf(); + if (ret < 0) { + RCU_INIT_POINTER(nf_nat_hook, NULL); + nf_ct_helper_expectfn_unregister(&follow_master_nat); + synchronize_net(); + unregister_pernet_subsys(&nat_net_ops); + kvfree(nf_nat_bysource); + } + + return ret; } static void __exit nf_nat_cleanup(void) -- cgit v1.2.3 From e97c089d7a49f67027395ddf70bf327eeac2611e Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Sat, 29 Oct 2022 00:10:49 +0800 Subject: rose: Fix NULL pointer dereference in rose_send_frame() The syzkaller reported an issue: KASAN: null-ptr-deref in range [0x0000000000000380-0x0000000000000387] CPU: 0 PID: 4069 Comm: kworker/0:15 Not tainted 6.0.0-syzkaller-02734-g0326074ff465 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/22/2022 Workqueue: rcu_gp srcu_invoke_callbacks RIP: 0010:rose_send_frame+0x1dd/0x2f0 net/rose/rose_link.c:101 Call Trace: rose_transmit_clear_request+0x1d5/0x290 net/rose/rose_link.c:255 rose_rx_call_request+0x4c0/0x1bc0 net/rose/af_rose.c:1009 rose_loopback_timer+0x19e/0x590 net/rose/rose_loopback.c:111 call_timer_fn+0x1a0/0x6b0 kernel/time/timer.c:1474 expire_timers kernel/time/timer.c:1519 [inline] __run_timers.part.0+0x674/0xa80 kernel/time/timer.c:1790 __run_timers kernel/time/timer.c:1768 [inline] run_timer_softirq+0xb3/0x1d0 kernel/time/timer.c:1803 __do_softirq+0x1d0/0x9c8 kernel/softirq.c:571 [...] 
It triggers a NULL pointer dereference when 'neigh->dev->dev_addr' is accessed in rose_send_frame(). The 'neigh' first appears in rose_loopback_timer() as 'rose_loopback_neigh', and the 'dev' in 'rose_loopback_neigh' is initialized as a null pointer. This had previously been fixed by commit 3b3fd068c56e3fbea30090859216a368398e39bf ("rose: Fix Null pointer dereference in rose_send_frame()"), but was reintroduced by commit 3c53cd65dece47dd1f9d3a809f32e59d1d87b2b8 ("rose: check NULL rose_loopback_neigh->loopback"). Fix it by adding a NULL check in rose_transmit_clear_request(): when the 'dev' in 'neigh' is NULL, do not reply to the request and just clear it. syzkaller did not provide a reproducer, so a syz reproducer along these lines can be used: r0 = syz_init_net_socket$bt_sco(0x1f, 0x5, 0x2) ioctl$sock_inet_SIOCSIFFLAGS(r0, 0x8914, &(0x7f0000000180)={'rose0\x00', 0x201}) r1 = syz_init_net_socket$rose(0xb, 0x5, 0x0) bind$rose(r1, &(0x7f00000000c0)=@full={0xb, @dev, @null, 0x0, [@null, @null, @netrom, @netrom, @default, @null]}, 0x40) connect$rose(r1, &(0x7f0000000240)=@short={0xb, @dev={0xbb, 0xbb, 0xbb, 0x1, 0x0}, @remote={0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x1}, 0x1, @netrom={0xbb, 0xbb, 0xbb, 0xbb, 0xbb, 0x0, 0x0}}, 0x1c) Fixes: 3c53cd65dece ("rose: check NULL rose_loopback_neigh->loopback") Signed-off-by: Zhang Qilong Signed-off-by: David S. Miller --- net/rose/rose_link.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c index 8b96a56d3a49..0f77ae8ef944 100644 --- a/net/rose/rose_link.c +++ b/net/rose/rose_link.c @@ -236,6 +236,9 @@ void rose_transmit_clear_request(struct rose_neigh *neigh, unsigned int lci, uns unsigned char *dptr; int len; + if (!neigh->dev) + return; + len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN + 3; if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL) -- cgit v1.2.3 From 510841da1fcc16f702440ab58ef0b4d82a9056b7 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Wed, 2 Nov 2022 10:40:47 +0100 Subject: netfilter: ipset: enforce documented limit to prevent allocating huge memory Daniel Xu reported that the hash:net,iface type of the ipset subsystem does not limit adding the same network with different interfaces to a set, which can lead to huge memory usage or allocation failure. The quick reproducer is $ ipset create ACL.IN.ALL_PERMIT hash:net,iface hashsize 1048576 timeout 0 $ for i in $(seq 0 100); do /sbin/ipset add ACL.IN.ALL_PERMIT 0.0.0.0/0,kaf_$i timeout 0 -exist; done The backtrace when vmalloc fails: [Tue Oct 25 00:13:08 2022] ipset: vmalloc error: size 1073741848, exceeds total pages <...> [Tue Oct 25 00:13:08 2022] Call Trace: [Tue Oct 25 00:13:08 2022] [Tue Oct 25 00:13:08 2022] dump_stack_lvl+0x48/0x60 [Tue Oct 25 00:13:08 2022] warn_alloc+0x155/0x180 [Tue Oct 25 00:13:08 2022] __vmalloc_node_range+0x72a/0x760 [Tue Oct 25 00:13:08 2022] ? hash_netiface4_add+0x7c0/0xb20 [Tue Oct 25 00:13:08 2022] ? __kmalloc_large_node+0x4a/0x90 [Tue Oct 25 00:13:08 2022] kvmalloc_node+0xa6/0xd0 [Tue Oct 25 00:13:08 2022] ? hash_netiface4_resize+0x99/0x710 <...> The fix is to enforce the limit documented in the ipset(8) manpage: > The internal restriction of the hash:net,iface set type is that the same > network prefix cannot be stored with more than 64 different interfaces > in a single set.
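The enforced behaviour boils down to capped, stepwise growth of the per-bucket array: grow by a fixed step until the documented maximum, then report the set as full instead of allocating further. A rough standalone sketch of that policy (illustrative constants and names, not the kernel code):

    #include <stdio.h>

    #define INIT_STEP 4   /* mirrors AHASH_INIT_SIZE */
    #define MAX_TUNED 64  /* documented limit: at most 64 entries per bucket */

    /* Returns 0 if the bucket may still grow, -1 if the set must be reported full. */
    static int grow_bucket(unsigned int *bucketsize, unsigned int needed)
    {
        if (*bucketsize >= MAX_TUNED)
            return -1;              /* set full: do not allocate more */
        if (*bucketsize < needed)
            *bucketsize += INIT_STEP;
        return 0;
    }

    int main(void)
    {
        unsigned int size = 4;

        while (grow_bucket(&size, size + 1) == 0)
            printf("bucket size grown to %u\n", size);
        printf("limit reached at %u, set reported full\n", size);
        return 0;
    }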
Fixes: ccf0a4b7fc68 ("netfilter: ipset: Add bucketsize parameter to all hash types") Reported-by: Daniel Xu Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_gen.h | 30 ++++++------------------------ 1 file changed, 6 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 6e391308431d..3adc291d9ce1 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -42,31 +42,8 @@ #define AHASH_MAX_SIZE (6 * AHASH_INIT_SIZE) /* Max muber of elements in the array block when tuned */ #define AHASH_MAX_TUNED 64 - #define AHASH_MAX(h) ((h)->bucketsize) -/* Max number of elements can be tuned */ -#ifdef IP_SET_HASH_WITH_MULTI -static u8 -tune_bucketsize(u8 curr, u32 multi) -{ - u32 n; - - if (multi < curr) - return curr; - - n = curr + AHASH_INIT_SIZE; - /* Currently, at listing one hash bucket must fit into a message. - * Therefore we have a hard limit here. - */ - return n > curr && n <= AHASH_MAX_TUNED ? n : curr; -} -#define TUNE_BUCKETSIZE(h, multi) \ - ((h)->bucketsize = tune_bucketsize((h)->bucketsize, multi)) -#else -#define TUNE_BUCKETSIZE(h, multi) -#endif - /* A hash bucket */ struct hbucket { struct rcu_head rcu; /* for call_rcu */ @@ -936,7 +913,12 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, goto set_full; /* Create a new slot */ if (n->pos >= n->size) { - TUNE_BUCKETSIZE(h, multi); +#ifdef IP_SET_HASH_WITH_MULTI + if (h->bucketsize >= AHASH_MAX_TUNED) + goto set_full; + else if (h->bucketsize < multi) + h->bucketsize += AHASH_INIT_SIZE; +#endif if (n->size >= AHASH_MAX(h)) { /* Trigger rehashing */ mtype_data_next(&h->next, d); -- cgit v1.2.3 From 3aff8aaca4e36dc8b17eaa011684881a80238966 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 5 Oct 2022 00:27:18 +0300 Subject: Bluetooth: L2CAP: Fix use-after-free caused by l2cap_reassemble_sdu Fix the race condition between the following two flows that run in parallel: 1. l2cap_reassemble_sdu -> chan->ops->recv (l2cap_sock_recv_cb) -> __sock_queue_rcv_skb. 2. bt_sock_recvmsg -> skb_recv_datagram, skb_free_datagram. An SKB can be queued by the first flow and immediately dequeued and freed by the second flow, therefore the callers of l2cap_reassemble_sdu can't use the SKB after that function returns. However, some places continue accessing struct l2cap_ctrl that resides in the SKB's CB for a short time after l2cap_reassemble_sdu returns, leading to a use-after-free condition (the stack trace is below, line numbers for kernel 5.19.8). Fix it by keeping a local copy of struct l2cap_ctrl. BUG: KASAN: use-after-free in l2cap_rx_state_recv (net/bluetooth/l2cap_core.c:6906) bluetooth Read of size 1 at addr ffff88812025f2f0 by task kworker/u17:3/43169 Workqueue: hci0 hci_rx_work [bluetooth] Call Trace: dump_stack_lvl (lib/dump_stack.c:107 (discriminator 4)) print_report.cold (mm/kasan/report.c:314 mm/kasan/report.c:429) ? l2cap_rx_state_recv (net/bluetooth/l2cap_core.c:6906) bluetooth kasan_report (mm/kasan/report.c:162 mm/kasan/report.c:493) ? 
l2cap_rx_state_recv (net/bluetooth/l2cap_core.c:6906) bluetooth l2cap_rx_state_recv (net/bluetooth/l2cap_core.c:6906) bluetooth l2cap_rx (net/bluetooth/l2cap_core.c:7236 net/bluetooth/l2cap_core.c:7271) bluetooth ret_from_fork (arch/x86/entry/entry_64.S:306) Allocated by task 43169: kasan_save_stack (mm/kasan/common.c:39) __kasan_slab_alloc (mm/kasan/common.c:45 mm/kasan/common.c:436 mm/kasan/common.c:469) kmem_cache_alloc_node (mm/slab.h:750 mm/slub.c:3243 mm/slub.c:3293) __alloc_skb (net/core/skbuff.c:414) l2cap_recv_frag (./include/net/bluetooth/bluetooth.h:425 net/bluetooth/l2cap_core.c:8329) bluetooth l2cap_recv_acldata (net/bluetooth/l2cap_core.c:8442) bluetooth hci_rx_work (net/bluetooth/hci_core.c:3642 net/bluetooth/hci_core.c:3832) bluetooth process_one_work (kernel/workqueue.c:2289) worker_thread (./include/linux/list.h:292 kernel/workqueue.c:2437) kthread (kernel/kthread.c:376) ret_from_fork (arch/x86/entry/entry_64.S:306) Freed by task 27920: kasan_save_stack (mm/kasan/common.c:39) kasan_set_track (mm/kasan/common.c:45) kasan_set_free_info (mm/kasan/generic.c:372) ____kasan_slab_free (mm/kasan/common.c:368 mm/kasan/common.c:328) slab_free_freelist_hook (mm/slub.c:1780) kmem_cache_free (mm/slub.c:3536 mm/slub.c:3553) skb_free_datagram (./include/net/sock.h:1578 ./include/net/sock.h:1639 net/core/datagram.c:323) bt_sock_recvmsg (net/bluetooth/af_bluetooth.c:295) bluetooth l2cap_sock_recvmsg (net/bluetooth/l2cap_sock.c:1212) bluetooth sock_read_iter (net/socket.c:1087) new_sync_read (./include/linux/fs.h:2052 fs/read_write.c:401) vfs_read (fs/read_write.c:482) ksys_read (fs/read_write.c:620) do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120) Link: https://lore.kernel.org/linux-bluetooth/CAKErNvoqga1WcmoR3-0875esY6TVWFQDandbVZncSiuGPBQXLA@mail.gmail.com/T/#u Fixes: d2a7ac5d5d3a ("Bluetooth: Add the ERTM receive state machine") Fixes: 4b51dae96731 ("Bluetooth: Add streaming mode receive and incoming packet classifier") Signed-off-by: Maxim Mikityanskiy Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 48 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 1f34b82ca0ec..2283871d3f01 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -6885,6 +6885,7 @@ static int l2cap_rx_state_recv(struct l2cap_chan *chan, struct l2cap_ctrl *control, struct sk_buff *skb, u8 event) { + struct l2cap_ctrl local_control; int err = 0; bool skb_in_use = false; @@ -6909,15 +6910,32 @@ static int l2cap_rx_state_recv(struct l2cap_chan *chan, chan->buffer_seq = chan->expected_tx_seq; skb_in_use = true; + /* l2cap_reassemble_sdu may free skb, hence invalidate + * control, so make a copy in advance to use it after + * l2cap_reassemble_sdu returns and to avoid the race + * condition, for example: + * + * The current thread calls: + * l2cap_reassemble_sdu + * chan->ops->recv == l2cap_sock_recv_cb + * __sock_queue_rcv_skb + * Another thread calls: + * bt_sock_recvmsg + * skb_recv_datagram + * skb_free_datagram + * Then the current thread tries to access control, but + * it was freed by skb_free_datagram. 
+ */ + local_control = *control; err = l2cap_reassemble_sdu(chan, skb, control); if (err) break; - if (control->final) { + if (local_control.final) { if (!test_and_clear_bit(CONN_REJ_ACT, &chan->conn_state)) { - control->final = 0; - l2cap_retransmit_all(chan, control); + local_control.final = 0; + l2cap_retransmit_all(chan, &local_control); l2cap_ertm_send(chan); } } @@ -7297,11 +7315,27 @@ static int l2cap_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control, static int l2cap_stream_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control, struct sk_buff *skb) { + /* l2cap_reassemble_sdu may free skb, hence invalidate control, so store + * the txseq field in advance to use it after l2cap_reassemble_sdu + * returns and to avoid the race condition, for example: + * + * The current thread calls: + * l2cap_reassemble_sdu + * chan->ops->recv == l2cap_sock_recv_cb + * __sock_queue_rcv_skb + * Another thread calls: + * bt_sock_recvmsg + * skb_recv_datagram + * skb_free_datagram + * Then the current thread tries to access control, but it was freed by + * skb_free_datagram. + */ + u16 txseq = control->txseq; + BT_DBG("chan %p, control %p, skb %p, state %d", chan, control, skb, chan->rx_state); - if (l2cap_classify_txseq(chan, control->txseq) == - L2CAP_TXSEQ_EXPECTED) { + if (l2cap_classify_txseq(chan, txseq) == L2CAP_TXSEQ_EXPECTED) { l2cap_pass_to_tx(chan, control); BT_DBG("buffer_seq %u->%u", chan->buffer_seq, @@ -7324,8 +7358,8 @@ static int l2cap_stream_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control, } } - chan->last_acked_seq = control->txseq; - chan->expected_tx_seq = __next_seq(chan, control->txseq); + chan->last_acked_seq = txseq; + chan->expected_tx_seq = __next_seq(chan, txseq); return 0; } -- cgit v1.2.3 From b36a234dc438cb6b76fc929a8df9a0e59c8acf23 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Tue, 11 Oct 2022 22:25:33 +0300 Subject: Bluetooth: hci_conn: Fix CIS connection dst_type handling hci_connect_cis and iso_connect_cis call hci_bind_cis inconsistently with dst_type being either ISO socket address type or the HCI type, but these values cannot be mixed like this. Fix this by using only the HCI type. CIS connection dst_type was also not initialized in hci_bind_cis, even though it is used in hci_conn_hash_lookup_cis to find existing connections. Set the value in hci_bind_cis, so that existing CIS connections are found e.g. when doing deferred socket connections, also when dst_type is not 0 (ADDR_LE_DEV_PUBLIC). 
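The underlying issue is that two different enumerations describe the same address: the ISO socket layer speaks in BDADDR_LE_* values while the HCI layer expects ADDR_LE_DEV_* values, and the translation has to happen exactly once at the boundary. A standalone sketch of such a mapping (the enum values below are illustrative stand-ins, not the real kernel constants):

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative stand-ins for the socket-level and HCI-level address types. */
    enum { SOCK_LE_PUBLIC = 1, SOCK_LE_RANDOM = 2 };
    enum { HCI_LE_PUBLIC = 0, HCI_LE_RANDOM = 1 };

    /* Convert a socket-level LE address type to the HCI-level one. */
    static uint8_t le_addr_type(uint8_t sock_type)
    {
        return sock_type == SOCK_LE_PUBLIC ? HCI_LE_PUBLIC : HCI_LE_RANDOM;
    }

    int main(void)
    {
        printf("public -> %u, random -> %u\n",
               le_addr_type(SOCK_LE_PUBLIC), le_addr_type(SOCK_LE_RANDOM));
        return 0;
    }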
Fixes: 26afbd826ee3 ("Bluetooth: Add initial implementation of CIS connections") Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 7 +------ net/bluetooth/iso.c | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 7a59c4487050..1176bad5d833 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1761,6 +1761,7 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, if (!cis) return ERR_PTR(-ENOMEM); cis->cleanup = cis_cleanup; + cis->dst_type = dst_type; } if (cis->state == BT_CONNECTED) @@ -2140,12 +2141,6 @@ struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, struct hci_conn *le; struct hci_conn *cis; - /* Convert from ISO socket address type to HCI address type */ - if (dst_type == BDADDR_LE_PUBLIC) - dst_type = ADDR_LE_DEV_PUBLIC; - else - dst_type = ADDR_LE_DEV_RANDOM; - if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) le = hci_connect_le(hdev, dst, dst_type, false, BT_SECURITY_LOW, diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 613039ba5dbf..f825857db6d0 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -235,6 +235,14 @@ static int iso_chan_add(struct iso_conn *conn, struct sock *sk, return err; } +static inline u8 le_addr_type(u8 bdaddr_type) +{ + if (bdaddr_type == BDADDR_LE_PUBLIC) + return ADDR_LE_DEV_PUBLIC; + else + return ADDR_LE_DEV_RANDOM; +} + static int iso_connect_bis(struct sock *sk) { struct iso_conn *conn; @@ -328,14 +336,16 @@ static int iso_connect_cis(struct sock *sk) /* Just bind if DEFER_SETUP has been set */ if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { hcon = hci_bind_cis(hdev, &iso_pi(sk)->dst, - iso_pi(sk)->dst_type, &iso_pi(sk)->qos); + le_addr_type(iso_pi(sk)->dst_type), + &iso_pi(sk)->qos); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto done; } } else { hcon = hci_connect_cis(hdev, &iso_pi(sk)->dst, - iso_pi(sk)->dst_type, &iso_pi(sk)->qos); + le_addr_type(iso_pi(sk)->dst_type), + &iso_pi(sk)->qos); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto done; -- cgit v1.2.3 From 0d0e2d032811280b927650ff3c15fe5020e82533 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Mon, 17 Oct 2022 15:58:13 +0800 Subject: Bluetooth: L2CAP: fix use-after-free in l2cap_conn_del() When l2cap_recv_frame() is invoked to receive data, and the cid is L2CAP_CID_A2MP, if the channel does not exist, it will create a channel. However, after a channel is created, the hold operation of the channel is not performed. In this case, the value of channel reference counting is 1. As a result, after hci_error_reset() is triggered, l2cap_conn_del() invokes the close hook function of A2MP to release the channel. Then l2cap_chan_unlock(chan) will trigger UAF issue. The process is as follows: Receive data: l2cap_data_channel() a2mp_channel_create() --->channel ref is 2 l2cap_chan_put() --->channel ref is 1 Triger event: hci_error_reset() hci_dev_do_close() ... 
l2cap_disconn_cfm() l2cap_conn_del() l2cap_chan_hold() --->channel ref is 2 l2cap_chan_del() --->channel ref is 1 a2mp_chan_close_cb() --->channel ref is 0, release channel l2cap_chan_unlock() --->UAF of channel The detailed Call Trace is as follows: BUG: KASAN: use-after-free in __mutex_unlock_slowpath+0xa6/0x5e0 Read of size 8 at addr ffff8880160664b8 by task kworker/u11:1/7593 Workqueue: hci0 hci_error_reset Call Trace: dump_stack_lvl+0xcd/0x134 print_report.cold+0x2ba/0x719 kasan_report+0xb1/0x1e0 kasan_check_range+0x140/0x190 __mutex_unlock_slowpath+0xa6/0x5e0 l2cap_conn_del+0x404/0x7b0 l2cap_disconn_cfm+0x8c/0xc0 hci_conn_hash_flush+0x11f/0x260 hci_dev_close_sync+0x5f5/0x11f0 hci_dev_do_close+0x2d/0x70 hci_error_reset+0x9e/0x140 process_one_work+0x98a/0x1620 worker_thread+0x665/0x1080 kthread+0x2e4/0x3a0 ret_from_fork+0x1f/0x30 Allocated by task 7593: kasan_save_stack+0x1e/0x40 __kasan_kmalloc+0xa9/0xd0 l2cap_chan_create+0x40/0x930 amp_mgr_create+0x96/0x990 a2mp_channel_create+0x7d/0x150 l2cap_recv_frame+0x51b8/0x9a70 l2cap_recv_acldata+0xaa3/0xc00 hci_rx_work+0x702/0x1220 process_one_work+0x98a/0x1620 worker_thread+0x665/0x1080 kthread+0x2e4/0x3a0 ret_from_fork+0x1f/0x30 Freed by task 7593: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 kasan_set_free_info+0x20/0x30 ____kasan_slab_free+0x167/0x1c0 slab_free_freelist_hook+0x89/0x1c0 kfree+0xe2/0x580 l2cap_chan_put+0x22a/0x2d0 l2cap_conn_del+0x3fc/0x7b0 l2cap_disconn_cfm+0x8c/0xc0 hci_conn_hash_flush+0x11f/0x260 hci_dev_close_sync+0x5f5/0x11f0 hci_dev_do_close+0x2d/0x70 hci_error_reset+0x9e/0x140 process_one_work+0x98a/0x1620 worker_thread+0x665/0x1080 kthread+0x2e4/0x3a0 ret_from_fork+0x1f/0x30 Last potentially related work creation: kasan_save_stack+0x1e/0x40 __kasan_record_aux_stack+0xbe/0xd0 call_rcu+0x99/0x740 netlink_release+0xe6a/0x1cf0 __sock_release+0xcd/0x280 sock_close+0x18/0x20 __fput+0x27c/0xa90 task_work_run+0xdd/0x1a0 exit_to_user_mode_prepare+0x23c/0x250 syscall_exit_to_user_mode+0x19/0x50 do_syscall_64+0x42/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Second to last potentially related work creation: kasan_save_stack+0x1e/0x40 __kasan_record_aux_stack+0xbe/0xd0 call_rcu+0x99/0x740 netlink_release+0xe6a/0x1cf0 __sock_release+0xcd/0x280 sock_close+0x18/0x20 __fput+0x27c/0xa90 task_work_run+0xdd/0x1a0 exit_to_user_mode_prepare+0x23c/0x250 syscall_exit_to_user_mode+0x19/0x50 do_syscall_64+0x42/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: d0be8347c623 ("Bluetooth: L2CAP: Fix use-after-free caused by l2cap_chan_put") Signed-off-by: Zhengchao Shao Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 2283871d3f01..9a32ce634919 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -7615,6 +7615,7 @@ static void l2cap_data_channel(struct l2cap_conn *conn, u16 cid, return; } + l2cap_chan_hold(chan); l2cap_chan_lock(chan); } else { BT_DBG("unknown cid 0x%4.4x", cid); -- cgit v1.2.3 From 7c9524d929648935bac2bbb4c20437df8f9c3f42 Mon Sep 17 00:00:00 2001 From: Hawkins Jiawei Date: Tue, 18 Oct 2022 10:18:51 +0800 Subject: Bluetooth: L2CAP: Fix memory leak in vhci_write Syzkaller reports a memory leak as follows: ==================================== BUG: memory leak unreferenced object 0xffff88810d81ac00 (size 240): [...] hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] __alloc_skb+0x1f9/0x270 net/core/skbuff.c:418 [] alloc_skb include/linux/skbuff.h:1257 [inline] [] bt_skb_alloc include/net/bluetooth/bluetooth.h:469 [inline] [] vhci_get_user drivers/bluetooth/hci_vhci.c:391 [inline] [] vhci_write+0x5f/0x230 drivers/bluetooth/hci_vhci.c:511 [] call_write_iter include/linux/fs.h:2192 [inline] [] new_sync_write fs/read_write.c:491 [inline] [] vfs_write+0x42d/0x540 fs/read_write.c:578 [] ksys_write+0x9d/0x160 fs/read_write.c:631 [] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 [] entry_SYSCALL_64_after_hwframe+0x63/0xcd ==================================== HCI core will uses hci_rx_work() to process frame, which is queued to the hdev->rx_q tail in hci_recv_frame() by HCI driver. Yet the problem is that, HCI core may not free the skb after handling ACL data packets. To be more specific, when start fragment does not contain the L2CAP length, HCI core just copies skb into conn->rx_skb and finishes frame process in l2cap_recv_acldata(), without freeing the skb, which triggers the above memory leak. This patch solves it by releasing the relative skb, after processing the above case in l2cap_recv_acldata(). Fixes: 4d7ea8ee90e4 ("Bluetooth: L2CAP: Fix handling fragmented length") Link: https://lore.kernel.org/all/0000000000000d0b1905e6aaef64@google.com/ Reported-and-tested-by: syzbot+8f819e36e01022991cfa@syzkaller.appspotmail.com Signed-off-by: Hawkins Jiawei Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 9a32ce634919..1fbe087d6ae4 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -8461,9 +8461,8 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) * expected length. */ if (skb->len < L2CAP_LEN_SIZE) { - if (l2cap_recv_frag(conn, skb, conn->mtu) < 0) - goto drop; - return; + l2cap_recv_frag(conn, skb, conn->mtu); + break; } len = get_unaligned_le16(skb->data) + L2CAP_HDR_SIZE; @@ -8507,7 +8506,7 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) /* Header still could not be read just continue */ if (conn->rx_skb->len < L2CAP_LEN_SIZE) - return; + break; } if (skb->len > conn->rx_len) { -- cgit v1.2.3 From 5638d9ea9c01c77fc11693d48cf719bc7e88f224 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 17 Oct 2022 15:36:23 -0700 Subject: Bluetooth: hci_conn: Fix not restoring ISO buffer count on disconnect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When disconnecting an ISO link the controller may not generate HCI_EV_NUM_COMP_PKTS for unacked packets which needs to be restored in hci_conn_del otherwise the host would assume they are still in use and would not be able to use all the buffers available. 
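In other words, on disconnect the still-unacknowledged frames of the ISO link must be credited back to whichever buffer pool the controller actually advertises: a dedicated ISO pool if present, else the LE pool, else the ACL pool. A minimal sketch of that fallback accounting (hypothetical structure, not the kernel's hci_dev):

    #include <stdio.h>

    struct dev_bufs {
        unsigned int iso_pkts, le_pkts;          /* pool sizes advertised by the controller */
        unsigned int iso_cnt, le_cnt, acl_cnt;   /* currently available credits */
    };

    /* Return 'sent' unacked frames to the pool the controller really uses. */
    static void restore_credits(struct dev_bufs *d, unsigned int sent)
    {
        if (d->iso_pkts)
            d->iso_cnt += sent;
        else if (d->le_pkts)
            d->le_cnt += sent;
        else
            d->acl_cnt += sent;
    }

    int main(void)
    {
        struct dev_bufs d = { .iso_pkts = 0, .le_pkts = 8, .le_cnt = 5 };

        restore_credits(&d, 3);             /* no ISO pool, so credits go back to LE */
        printf("le_cnt = %u\n", d.le_cnt);  /* prints 8 */
        return 0;
    }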
Fixes: 26afbd826ee3 ("Bluetooth: Add initial implementation of CIS connections") Signed-off-by: Luiz Augusto von Dentz Tested-by: Frédéric Danis --- net/bluetooth/hci_conn.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 1176bad5d833..a6c12863a253 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1067,10 +1067,21 @@ int hci_conn_del(struct hci_conn *conn) hdev->acl_cnt += conn->sent; } else { struct hci_conn *acl = conn->link; + if (acl) { acl->link = NULL; hci_conn_drop(acl); } + + /* Unacked ISO frames */ + if (conn->type == ISO_LINK) { + if (hdev->iso_pkts) + hdev->iso_cnt += conn->sent; + else if (hdev->le_pkts) + hdev->le_cnt += conn->sent; + else + hdev->acl_cnt += conn->sent; + } } if (conn->amp_mgr) -- cgit v1.2.3 From 711f8c3fb3db61897080468586b970c87c61d9e4 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 31 Oct 2022 16:10:32 -0700 Subject: Bluetooth: L2CAP: Fix accepting connection request for invalid SPSM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Bluetooth spec states that the valid range for SPSM is from 0x0001-0x00ff so it is invalid to accept values outside of this range: BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 3, Part A page 1059: Table 4.15: L2CAP_LE_CREDIT_BASED_CONNECTION_REQ SPSM ranges CVE: CVE-2022-42896 CC: stable@vger.kernel.org Reported-by: Tamás Koczka Signed-off-by: Luiz Augusto von Dentz Reviewed-by: Tedd Ho-Jeong An --- net/bluetooth/l2cap_core.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 1fbe087d6ae4..3eee915fb245 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -5813,6 +5813,19 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn, BT_DBG("psm 0x%2.2x scid 0x%4.4x mtu %u mps %u", __le16_to_cpu(psm), scid, mtu, mps); + /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 3, Part A + * page 1059: + * + * Valid range: 0x0001-0x00ff + * + * Table 4.15: L2CAP_LE_CREDIT_BASED_CONNECTION_REQ SPSM ranges + */ + if (!psm || __le16_to_cpu(psm) > L2CAP_PSM_LE_DYN_END) { + result = L2CAP_CR_LE_BAD_PSM; + chan = NULL; + goto response; + } + /* Check if we have socket listening on psm */ pchan = l2cap_global_chan_by_psm(BT_LISTEN, psm, &conn->hcon->src, &conn->hcon->dst, LE_LINK); @@ -6001,6 +6014,18 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, psm = req->psm; + /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 3, Part A + * page 1059: + * + * Valid range: 0x0001-0x00ff + * + * Table 4.15: L2CAP_LE_CREDIT_BASED_CONNECTION_REQ SPSM ranges + */ + if (!psm || __le16_to_cpu(psm) > L2CAP_PSM_LE_DYN_END) { + result = L2CAP_CR_LE_BAD_PSM; + goto response; + } + BT_DBG("psm 0x%2.2x mtu %u mps %u", __le16_to_cpu(psm), mtu, mps); memset(&pdu, 0, sizeof(pdu)); -- cgit v1.2.3 From f937b758a188d6fd328a81367087eddbb2fce50f Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 31 Oct 2022 16:10:33 -0700 Subject: Bluetooth: L2CAP: Fix l2cap_global_chan_by_psm l2cap_global_chan_by_psm shall not return fixed channels as they are not meant to be connected by (S)PSM. 
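The rule the fix enforces is simple: when searching the global channel list by (S)PSM, fixed channels must be skipped entirely, regardless of what PSM value they happen to carry. A standalone sketch of such a lookup (made-up types; the real code walks the global channel list with its own locking):

    #include <stddef.h>
    #include <stdio.h>

    enum chan_type { CHAN_FIXED, CHAN_CONN_ORIENTED };

    struct chan {
        enum chan_type type;
        unsigned short psm;
    };

    /* Return the first connection-oriented channel matching 'psm', skipping fixed ones. */
    static const struct chan *lookup_by_psm(const struct chan *chans, size_t n, unsigned short psm)
    {
        for (size_t i = 0; i < n; i++) {
            if (chans[i].type != CHAN_FIXED && chans[i].psm == psm)
                return &chans[i];
        }
        return NULL;
    }

    int main(void)
    {
        const struct chan chans[] = {
            { CHAN_FIXED, 0x0080 },          /* must never be returned */
            { CHAN_CONN_ORIENTED, 0x0080 },
        };
        const struct chan *c = lookup_by_psm(chans, 2, 0x0080);

        printf("matched index %td\n", c - chans);  /* prints 1: the fixed channel was skipped */
        return 0;
    }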
Signed-off-by: Luiz Augusto von Dentz Reviewed-by: Tedd Ho-Jeong An --- net/bluetooth/l2cap_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 3eee915fb245..4d8a1d862f6b 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1990,7 +1990,7 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm, if (link_type == LE_LINK && c->src_type == BDADDR_BREDR) continue; - if (c->psm == psm) { + if (c->chan_type != L2CAP_CHAN_FIXED && c->psm == psm) { int src_match, dst_match; int src_any, dst_any; -- cgit v1.2.3 From b1a2cd50c0357f243b7435a732b4e62ba3157a2e Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 31 Oct 2022 16:10:52 -0700 Subject: Bluetooth: L2CAP: Fix attempting to access uninitialized memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On l2cap_parse_conf_req the variable efs is only initialized if remote_efs has been set. CVE: CVE-2022-42895 CC: stable@vger.kernel.org Reported-by: Tamás Koczka Signed-off-by: Luiz Augusto von Dentz Reviewed-by: Tedd Ho-Jeong An --- net/bluetooth/l2cap_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 4d8a1d862f6b..9c24947aa41e 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -3764,7 +3764,8 @@ done: l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc), (unsigned long) &rfc, endptr - ptr); - if (test_bit(FLAG_EFS_ENABLE, &chan->flags)) { + if (remote_efs && + test_bit(FLAG_EFS_ENABLE, &chan->flags)) { chan->remote_id = efs.id; chan->remote_stype = efs.stype; chan->remote_msdu = le16_to_cpu(efs.msdu); -- cgit v1.2.3 From 62ff373da2534534c55debe6c724c7fe14adb97f Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Tue, 1 Nov 2022 17:37:22 +0800 Subject: net/smc: Fix possible leaked pernet namespace in smc_init() In smc_init(), register_pernet_subsys(&smc_net_stat_ops) is called without any error handling. If it fails, registering of &smc_net_ops won't be reverted. And if smc_nl_init() fails, &smc_net_stat_ops itself won't be reverted. This leaves wild ops in subsystem linkedlist and when another module tries to call register_pernet_operations() it triggers page fault: BUG: unable to handle page fault for address: fffffbfff81b964c RIP: 0010:register_pernet_operations+0x1b9/0x5f0 Call Trace: register_pernet_subsys+0x29/0x40 ebtables_init+0x58/0x1000 [ebtables] ... 
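The general shape of the fix is the usual init/unwind pattern: every successful registration needs a matching cleanup label, and a failure at step N must roll back steps N-1..1 in reverse order. A compact standalone sketch of that pattern (placeholder register/unregister functions, not the SMC code):

    #include <stdio.h>

    static int register_a(void) { return 0; }           /* placeholder: succeeds */
    static void unregister_a(void) { puts("undo a"); }
    static int register_b(void) { return 0; }            /* placeholder: succeeds */
    static void unregister_b(void) { puts("undo b"); }
    static int register_c(void) { return -1; }           /* placeholder: fails */

    static int subsys_init(void)
    {
        int rc;

        rc = register_a();
        if (rc)
            goto out;
        rc = register_b();
        if (rc)
            goto out_a;
        rc = register_c();
        if (rc)
            goto out_b;   /* roll back b, then a: nothing is leaked */
        return 0;

    out_b:
        unregister_b();
    out_a:
        unregister_a();
    out:
        return rc;
    }

    int main(void)
    {
        printf("init rc = %d\n", subsys_init());
        return 0;
    }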
Fixes: 194730a9beb5 ("net/smc: Make SMC statistics network namespace aware") Signed-off-by: Chen Zhongjin Reviewed-by: Tony Lu Reviewed-by: Wenjia Zhang Link: https://lore.kernel.org/r/20221101093722.127223-1-chenzhongjin@huawei.com Signed-off-by: Jakub Kicinski --- net/smc/af_smc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 3ccbf3c201cd..e12d4fa5aece 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -3380,14 +3380,14 @@ static int __init smc_init(void) rc = register_pernet_subsys(&smc_net_stat_ops); if (rc) - return rc; + goto out_pernet_subsys; smc_ism_init(); smc_clc_init(); rc = smc_nl_init(); if (rc) - goto out_pernet_subsys; + goto out_pernet_subsys_stat; rc = smc_pnet_init(); if (rc) @@ -3480,6 +3480,8 @@ out_pnet: smc_pnet_exit(); out_nl: smc_nl_exit(); +out_pernet_subsys_stat: + unregister_pernet_subsys(&smc_net_stat_ops); out_pernet_subsys: unregister_pernet_subsys(&smc_net_ops); -- cgit v1.2.3 From f8017317cb0b279b8ab98b0f3901a2e0ac880dad Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Tue, 1 Nov 2022 20:15:52 +0800 Subject: net, neigh: Fix null-ptr-deref in neigh_table_clear() When IPv6 module gets initialized but hits an error in the middle, kenel panic with: KASAN: null-ptr-deref in range [0x0000000000000598-0x000000000000059f] CPU: 1 PID: 361 Comm: insmod Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) RIP: 0010:__neigh_ifdown.isra.0+0x24b/0x370 RSP: 0018:ffff888012677908 EFLAGS: 00000202 ... Call Trace: neigh_table_clear+0x94/0x2d0 ndisc_cleanup+0x27/0x40 [ipv6] inet6_init+0x21c/0x2cb [ipv6] do_one_initcall+0xd3/0x4d0 do_init_module+0x1ae/0x670 ... Kernel panic - not syncing: Fatal exception When ipv6 initialization fails, it will try to cleanup and calls: neigh_table_clear() neigh_ifdown(tbl, NULL) pneigh_queue_purge(&tbl->proxy_queue, dev_net(dev == NULL)) # dev_net(NULL) triggers null-ptr-deref. Fix it by passing NULL to pneigh_queue_purge() in neigh_ifdown() if dev is NULL, to make kernel not panic immediately. Fixes: 66ba215cb513 ("neigh: fix possible DoS due to net iface start/stop loop") Signed-off-by: Chen Zhongjin Reviewed-by: Eric Dumazet Reviewed-by: Denis V. Lunev Link: https://lore.kernel.org/r/20221101121552.21890-1-chenzhongjin@huawei.com Signed-off-by: Jakub Kicinski --- net/core/neighbour.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 3c4786b99907..a77a85e357e0 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -409,7 +409,7 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, write_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev, skip_perm); pneigh_ifdown_and_unlock(tbl, dev); - pneigh_queue_purge(&tbl->proxy_queue, dev_net(dev)); + pneigh_queue_purge(&tbl->proxy_queue, dev ? 
dev_net(dev) : NULL); if (skb_queue_empty_lockless(&tbl->proxy_queue)) del_timer_sync(&tbl->proxy_timer); return 0; -- cgit v1.2.3 From 628ac04a75ed5ff13647e725f40192da22ef2be8 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 1 Nov 2022 20:57:53 +0200 Subject: bridge: Fix flushing of dynamic FDB entries The following commands should result in all the dynamic FDB entries being flushed, but instead all the non-local (non-permanent) entries are flushed: # bridge fdb add 00:aa:bb:cc:dd:ee dev dummy1 master static # bridge fdb add 00:11:22:33:44:55 dev dummy1 master dynamic # ip link set dev br0 type bridge fdb_flush # bridge fdb show brport dummy1 00:00:00:00:00:01 master br0 permanent 33:33:00:00:00:01 self permanent 01:00:5e:00:00:01 self permanent This is because br_fdb_flush() works with FDB flags and not the corresponding enumerator values. Fix by passing the FDB flag instead. After the fix: # bridge fdb add 00:aa:bb:cc:dd:ee dev dummy1 master static # bridge fdb add 00:11:22:33:44:55 dev dummy1 master dynamic # ip link set dev br0 type bridge fdb_flush # bridge fdb show brport dummy1 00:aa:bb:cc:dd:ee master br0 static 00:00:00:00:00:01 master br0 permanent 33:33:00:00:00:01 self permanent 01:00:5e:00:00:01 self permanent Fixes: 1f78ee14eeac ("net: bridge: fdb: add support for fine-grained flushing") Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20221101185753.2120691-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/bridge/br_netlink.c | 2 +- net/bridge/br_sysfs_br.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 5aeb3646e74c..d087fd4c784a 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1332,7 +1332,7 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (data[IFLA_BR_FDB_FLUSH]) { struct net_bridge_fdb_flush_desc desc = { - .flags_mask = BR_FDB_STATIC + .flags_mask = BIT(BR_FDB_STATIC) }; br_fdb_flush(br, &desc); diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 612e367fff20..ea733542244c 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -345,7 +345,7 @@ static int set_flush(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { struct net_bridge_fdb_flush_desc desc = { - .flags_mask = BR_FDB_STATIC + .flags_mask = BIT(BR_FDB_STATIC) }; br_fdb_flush(br, &desc); -- cgit v1.2.3 From 768b3c745fe5789f2430bdab02f35a9ad1148d97 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Wed, 2 Nov 2022 10:06:10 +0800 Subject: ipv6: fix WARNING in ip6_route_net_exit_late() During the initialization in ip6_route_net_init_late(), if the ipv6_route or rt6_stats proc file fails to be created, initialization still succeeds by default. Therefore, the ipv6_route or rt6_stats file is not found during removal in ip6_route_net_exit_late(), which causes a WARNING.
The following is the stack information: name 'rt6_stats' WARNING: CPU: 0 PID: 9 at fs/proc/generic.c:712 remove_proc_entry+0x389/0x460 Modules linked in: Workqueue: netns cleanup_net RIP: 0010:remove_proc_entry+0x389/0x460 PKRU: 55555554 Call Trace: ops_exit_list+0xb0/0x170 cleanup_net+0x4ea/0xb00 process_one_work+0x9bf/0x1710 worker_thread+0x665/0x1080 kthread+0x2e4/0x3a0 ret_from_fork+0x1f/0x30 Fixes: cdb1876192db ("[NETNS][IPV6] route6 - create route6 proc files for the namespace") Signed-off-by: Zhengchao Shao Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20221102020610.351330-1-shaozhengchao@huawei.com Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 69252eb462b2..2f355f0ec32a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6555,10 +6555,16 @@ static void __net_exit ip6_route_net_exit(struct net *net) static int __net_init ip6_route_net_init_late(struct net *net) { #ifdef CONFIG_PROC_FS - proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops, - sizeof(struct ipv6_route_iter)); - proc_create_net_single("rt6_stats", 0444, net->proc_net, - rt6_stats_seq_show, NULL); + if (!proc_create_net("ipv6_route", 0, net->proc_net, + &ipv6_route_seq_ops, + sizeof(struct ipv6_route_iter))) + return -ENOMEM; + + if (!proc_create_net_single("rt6_stats", 0444, net->proc_net, + rt6_stats_seq_show, NULL)) { + remove_proc_entry("ipv6_route", net->proc_net); + return -ENOMEM; + } #endif return 0; } -- cgit v1.2.3 From cf6ff0df0fd123493e57278a1bd4414a97511a34 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Mon, 31 Oct 2022 19:17:05 -0700 Subject: vsock: remove the unused 'wait' in vsock_connectible_recvmsg() Remove the unused variable introduced by 19c1b90e1979. Fixes: 19c1b90e1979 ("af_vsock: separate receive data loop") Signed-off-by: Dexuan Cui Reviewed-by: Stefano Garzarella Signed-off-by: Paolo Abeni --- net/vmw_vsock/af_vsock.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index ee418701cdee..d258fd43092e 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -2092,8 +2092,6 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, const struct vsock_transport *transport; int err; - DEFINE_WAIT(wait); - sk = sock->sk; vsk = vsock_sk(sk); err = 0; -- cgit v1.2.3 From 466a85336fee6e3b35eb97b8405a28302fd25809 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Mon, 31 Oct 2022 19:17:06 -0700 Subject: vsock: fix possible infinite sleep in vsock_connectible_wait_data() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently vsock_connectible_has_data() may miss a wakeup operation between vsock_connectible_has_data() == 0 and the prepare_to_wait(). Fix the race by adding the process to the wait queue before checking vsock_connectible_has_data(). 
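The race is the classic lost-wakeup pattern: checking "is there data?" before joining the wait queue leaves a window in which the wakeup fires while nobody is waiting, after which the task can sleep forever. The rough userspace analogue is re-checking the predicate only while holding the lock that the waker also takes, e.g. with a condition variable (illustrative analogy only, not the kernel wait-queue API):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
    static int have_data;

    static void *producer(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&lock);
        have_data = 1;
        pthread_cond_signal(&cond);    /* wakeup cannot be lost: consumer checks under 'lock' */
        pthread_mutex_unlock(&lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, producer, NULL);
        pthread_mutex_lock(&lock);
        while (!have_data)             /* predicate re-checked before every sleep */
            pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
        pthread_join(t, NULL);
        puts("got data");
        return 0;
    }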
Fixes: b3f7fd54881b ("af_vsock: separate wait data loop") Signed-off-by: Dexuan Cui Reviewed-by: Stefano Garzarella Reported-by: Frédéric Dalleau Tested-by: Frédéric Dalleau Signed-off-by: Paolo Abeni --- net/vmw_vsock/af_vsock.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index d258fd43092e..884eca7f6743 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1905,8 +1905,11 @@ static int vsock_connectible_wait_data(struct sock *sk, err = 0; transport = vsk->transport; - while ((data = vsock_connectible_has_data(vsk)) == 0) { + while (1) { prepare_to_wait(sk_sleep(sk), wait, TASK_INTERRUPTIBLE); + data = vsock_connectible_has_data(vsk); + if (data != 0) + break; if (sk->sk_err != 0 || (sk->sk_shutdown & RCV_SHUTDOWN) || -- cgit v1.2.3 From 8bbabb3fddcd0f858be69ed5abc9b470a239d6f2 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 1 Nov 2022 21:34:17 -0700 Subject: bpf, sock_map: Move cancel_work_sync() out of sock lock Stanislav reported a lockdep warning, which is caused by the cancel_work_sync() called inside sock_map_close(), as analyzed below by Jakub: psock->work.func = sk_psock_backlog() ACQUIRE psock->work_mutex sk_psock_handle_skb() skb_send_sock() __skb_send_sock() sendpage_unlocked() kernel_sendpage() sock->ops->sendpage = inet_sendpage() sk->sk_prot->sendpage = tcp_sendpage() ACQUIRE sk->sk_lock tcp_sendpage_locked() RELEASE sk->sk_lock RELEASE psock->work_mutex sock_map_close() ACQUIRE sk->sk_lock sk_psock_stop() sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED) cancel_work_sync() __cancel_work_timer() __flush_work() // wait for psock->work to finish RELEASE sk->sk_lock We can move the cancel_work_sync() out of the sock lock protection, but still before saved_close() was called. 
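The deadlock shape is generic: waiting for a worker to finish while holding a lock that the worker itself may need can never complete, so the safe ordering is to drop the lock first and only then wait. A small userspace analogue of that ordering (pthread stand-ins for the sock lock and the psock worker, not the kernel code):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t sk_lock = PTHREAD_MUTEX_INITIALIZER;

    static void *worker(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&sk_lock);   /* the worker also needs the "sock lock" */
        puts("worker ran");
        pthread_mutex_unlock(&sk_lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        pthread_mutex_lock(&sk_lock);
        pthread_create(&t, NULL, worker, NULL);
        /* ... tear-down work done under the lock ... */
        pthread_mutex_unlock(&sk_lock); /* release first ... */
        pthread_join(t, NULL);          /* ... then wait for the worker: no deadlock */
        puts("close finished");
        return 0;
    }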
Fixes: 799aa7f98d53 ("skmsg: Avoid lock_sock() in sk_psock_backlog()") Reported-by: Stanislav Fomichev Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Tested-by: Jakub Sitnicki Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20221102043417.279409-1-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 2 +- net/core/skmsg.c | 7 ++----- net/core/sock_map.c | 7 ++++--- 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 48f4b645193b..70d6cb94e580 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -376,7 +376,7 @@ static inline void sk_psock_report_error(struct sk_psock *psock, int err) } struct sk_psock *sk_psock_init(struct sock *sk, int node); -void sk_psock_stop(struct sk_psock *psock, bool wait); +void sk_psock_stop(struct sk_psock *psock); #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 1efdc47a999b..e6b9ced3eda8 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -803,16 +803,13 @@ static void sk_psock_link_destroy(struct sk_psock *psock) } } -void sk_psock_stop(struct sk_psock *psock, bool wait) +void sk_psock_stop(struct sk_psock *psock) { spin_lock_bh(&psock->ingress_lock); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); sk_psock_cork_free(psock); __sk_psock_zap_ingress(psock); spin_unlock_bh(&psock->ingress_lock); - - if (wait) - cancel_work_sync(&psock->work); } static void sk_psock_done_strp(struct sk_psock *psock); @@ -850,7 +847,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); - sk_psock_stop(psock, false); + sk_psock_stop(psock); INIT_RCU_WORK(&psock->rwork, sk_psock_destroy); queue_rcu_work(system_wq, &psock->rwork); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index a660baedd9e7..81beb16ab1eb 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1596,7 +1596,7 @@ void sock_map_destroy(struct sock *sk) saved_destroy = psock->saved_destroy; sock_map_remove_links(sk, psock); rcu_read_unlock(); - sk_psock_stop(psock, false); + sk_psock_stop(psock); sk_psock_put(sk, psock); saved_destroy(sk); } @@ -1619,9 +1619,10 @@ void sock_map_close(struct sock *sk, long timeout) saved_close = psock->saved_close; sock_map_remove_links(sk, psock); rcu_read_unlock(); - sk_psock_stop(psock, true); - sk_psock_put(sk, psock); + sk_psock_stop(psock); release_sock(sk); + cancel_work_sync(&psock->work); + sk_psock_put(sk, psock); saved_close(sk, timeout); } EXPORT_SYMBOL_GPL(sock_map_close); -- cgit v1.2.3 From 9e4b7a99a03aefd37ba7bb1f022c8efab5019165 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Wed, 2 Nov 2022 17:53:25 +0100 Subject: net: gso: fix panic on frag_list with mixed head alloc types Since commit 3dcbdb134f32 ("net: gso: Fix skb_segment splat when splitting gso_size mangled skb having linear-headed frag_list"), it is allowed to change gso_size of a GRO packet. However, that commit assumes that "checking the first list_skb member suffices; i.e if either of the list_skb members have non head_frag head, then the first one has too". It turns out this assumption does not hold. We've seen BUG_ON being hit in skb_segment when skbs on the frag_list had differing head_frag with the vmxnet3 driver. 
This happens because __netdev_alloc_skb and __napi_alloc_skb can return a skb that is page backed or kmalloced depending on the requested size. As the result, the last small skb in the GRO packet can be kmalloced. There are three different locations where this can be fixed: (1) We could check head_frag in GRO and not allow GROing skbs with different head_frag. However, that would lead to performance regression on normal forward paths with unmodified gso_size, where !head_frag in the last packet is not a problem. (2) Set a flag in bpf_skb_net_grow and bpf_skb_net_shrink indicating that NETIF_F_SG is undesirable. That would need to eat a bit in sk_buff. Furthermore, that flag can be unset when all skbs on the frag_list are page backed. To retain good performance, bpf_skb_net_grow/shrink would have to walk the frag_list. (3) Walk the frag_list in skb_segment when determining whether NETIF_F_SG should be cleared. This of course slows things down. This patch implements (3). To limit the performance impact in skb_segment, the list is walked only for skbs with SKB_GSO_DODGY set that have gso_size changed. Normal paths thus will not hit it. We could check only the last skb but since we need to walk the whole list anyway, let's stay on the safe side. Fixes: 3dcbdb134f32 ("net: gso: Fix skb_segment splat when splitting gso_size mangled skb having linear-headed frag_list") Signed-off-by: Jiri Benc Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/e04426a6a91baf4d1081e1b478c82b5de25fdf21.1667407944.git.jbenc@redhat.com Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d1a3fa6f3f12..88fa40571d0c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4134,23 +4134,25 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, int i = 0; int pos; - if (list_skb && !list_skb->head_frag && skb_headlen(list_skb) && - (skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY)) { - /* gso_size is untrusted, and we have a frag_list with a linear - * non head_frag head. - * - * (we assume checking the first list_skb member suffices; - * i.e if either of the list_skb members have non head_frag - * head, then the first one has too). - * - * If head_skb's headlen does not fit requested gso_size, it - * means that the frag_list members do NOT terminate on exact - * gso_size boundaries. Hence we cannot perform skb_frag_t page - * sharing. Therefore we must fallback to copying the frag_list - * skbs; we do so by disabling SG. - */ - if (mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) - features &= ~NETIF_F_SG; + if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) && + mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) { + struct sk_buff *check_skb; + + for (check_skb = list_skb; check_skb; check_skb = check_skb->next) { + if (skb_headlen(check_skb) && !check_skb->head_frag) { + /* gso_size is untrusted, and we have a frag_list with + * a linear non head_frag item. + * + * If head_skb's headlen does not fit requested gso_size, + * it means that the frag_list members do NOT terminate + * on exact gso_size boundaries. Hence we cannot perform + * skb_frag_t page sharing. Therefore we must fallback to + * copying the frag_list skbs; we do so by disabling SG. 
+ */ + features &= ~NETIF_F_SG; + break; + } + } } __skb_push(head_skb, doffset); -- cgit v1.2.3 From d3fd203f36d46aa29600a72d57a1b61af80e4a25 Mon Sep 17 00:00:00 2001 From: Baisong Zhong Date: Wed, 2 Nov 2022 16:16:20 +0800 Subject: bpf, test_run: Fix alignment problem in bpf_prog_test_run_skb() We got a syzkaller problem because of aarch64 alignment fault if KFENCE enabled. When the size from user bpf program is an odd number, like 399, 407, etc, it will cause the struct skb_shared_info's unaligned access. As seen below: BUG: KFENCE: use-after-free read in __skb_clone+0x23c/0x2a0 net/core/skbuff.c:1032 Use-after-free read at 0xffff6254fffac077 (in kfence-#213): __lse_atomic_add arch/arm64/include/asm/atomic_lse.h:26 [inline] arch_atomic_add arch/arm64/include/asm/atomic.h:28 [inline] arch_atomic_inc include/linux/atomic-arch-fallback.h:270 [inline] atomic_inc include/asm-generic/atomic-instrumented.h:241 [inline] __skb_clone+0x23c/0x2a0 net/core/skbuff.c:1032 skb_clone+0xf4/0x214 net/core/skbuff.c:1481 ____bpf_clone_redirect net/core/filter.c:2433 [inline] bpf_clone_redirect+0x78/0x1c0 net/core/filter.c:2420 bpf_prog_d3839dd9068ceb51+0x80/0x330 bpf_dispatcher_nop_func include/linux/bpf.h:728 [inline] bpf_test_run+0x3c0/0x6c0 net/bpf/test_run.c:53 bpf_prog_test_run_skb+0x638/0xa7c net/bpf/test_run.c:594 bpf_prog_test_run kernel/bpf/syscall.c:3148 [inline] __do_sys_bpf kernel/bpf/syscall.c:4441 [inline] __se_sys_bpf+0xad0/0x1634 kernel/bpf/syscall.c:4381 kfence-#213: 0xffff6254fffac000-0xffff6254fffac196, size=407, cache=kmalloc-512 allocated by task 15074 on cpu 0 at 1342.585390s: kmalloc include/linux/slab.h:568 [inline] kzalloc include/linux/slab.h:675 [inline] bpf_test_init.isra.0+0xac/0x290 net/bpf/test_run.c:191 bpf_prog_test_run_skb+0x11c/0xa7c net/bpf/test_run.c:512 bpf_prog_test_run kernel/bpf/syscall.c:3148 [inline] __do_sys_bpf kernel/bpf/syscall.c:4441 [inline] __se_sys_bpf+0xad0/0x1634 kernel/bpf/syscall.c:4381 __arm64_sys_bpf+0x50/0x60 kernel/bpf/syscall.c:4381 To fix the problem, we adjust @size so that (@size + @hearoom) is a multiple of SMP_CACHE_BYTES. So we make sure the struct skb_shared_info is aligned to a cache line. 
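The arithmetic behind the one-line fix: rounding the requested size up to the next multiple of the cache-line size guarantees that whatever is placed right after the data (here struct skb_shared_info) starts on an aligned boundary. A standalone sketch of that rounding, assuming a 64-byte cache line:

    #include <stdio.h>

    #define CACHE_BYTES 64
    #define DATA_ALIGN(x) (((x) + (CACHE_BYTES - 1)) & ~(unsigned long)(CACHE_BYTES - 1))

    int main(void)
    {
        unsigned long sizes[] = { 399, 407, 448 };

        for (int i = 0; i < 3; i++)
            printf("%lu -> %lu\n", sizes[i], DATA_ALIGN(sizes[i]));
        /* 399 -> 448, 407 -> 448, 448 -> 448: the tail structure stays aligned */
        return 0;
    }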
Fixes: 1cf1cae963c2 ("bpf: introduce BPF_PROG_TEST_RUN command") Signed-off-by: Baisong Zhong Signed-off-by: Daniel Borkmann Cc: Eric Dumazet Link: https://lore.kernel.org/bpf/20221102081620.1465154-1-zhongbaisong@huawei.com --- net/bpf/test_run.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 13d578ce2a09..fcb3e6c5e03c 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -774,6 +774,7 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size, if (user_size > size) return ERR_PTR(-EMSGSIZE); + size = SKB_DATA_ALIGN(size); data = kzalloc(size + headroom + tailroom, GFP_USER); if (!data) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 0c175da7b0378445f5ef53904247cfbfb87e0b78 Mon Sep 17 00:00:00 2001 From: Lu Wei Date: Fri, 4 Nov 2022 10:27:23 +0800 Subject: tcp: prohibit TCP_REPAIR_OPTIONS if data was already sent If setsockopt with option name of TCP_REPAIR_OPTIONS and opt_code of TCPOPT_SACK_PERM is called to enable sack after data is sent and dupacks are received , it will trigger a warning in function tcp_verify_left_out() as follows: ============================================ WARNING: CPU: 8 PID: 0 at net/ipv4/tcp_input.c:2132 tcp_timeout_mark_lost+0x154/0x160 tcp_enter_loss+0x2b/0x290 tcp_retransmit_timer+0x50b/0x640 tcp_write_timer_handler+0x1c8/0x340 tcp_write_timer+0xe5/0x140 call_timer_fn+0x3a/0x1b0 __run_timers.part.0+0x1bf/0x2d0 run_timer_softirq+0x43/0xb0 __do_softirq+0xfd/0x373 __irq_exit_rcu+0xf6/0x140 The warning is caused in the following steps: 1. a socket named socketA is created 2. socketA enters repair mode without build a connection 3. socketA calls connect() and its state is changed to TCP_ESTABLISHED directly 4. socketA leaves repair mode 5. socketA calls sendmsg() to send data, packets_out and sack_outs(dup ack receives) increase 6. socketA enters repair mode again 7. socketA calls setsockopt with TCPOPT_SACK_PERM to enable sack 8. retransmit timer expires, it calls tcp_timeout_mark_lost(), lost_out increases 9. sack_outs + lost_out > packets_out triggers since lost_out and sack_outs increase repeatly In function tcp_timeout_mark_lost(), tp->sacked_out will be cleared if Step7 not happen and the warning will not be triggered. As suggested by Denis and Eric, TCP_REPAIR_OPTIONS should be prohibited if data was already sent. socket-tcp tests in CRIU has been tested as follows: $ sudo ./test/zdtm.py run -t zdtm/static/socket-tcp* --keep-going \ --ignore-taint socket-tcp* represent all socket-tcp tests in test/zdtm/static/. Fixes: b139ba4e90dc ("tcp: Repair connection-time negotiated parameters") Signed-off-by: Lu Wei Reviewed-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ef14efa1fb70..54836a6b81d6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3647,7 +3647,7 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, case TCP_REPAIR_OPTIONS: if (!tp->repair) err = -EINVAL; - else if (sk->sk_state == TCP_ESTABLISHED) + else if (sk->sk_state == TCP_ESTABLISHED && !tp->bytes_sent) err = tcp_repair_options_est(sk, optval, optlen); else err = -EPERM; -- cgit v1.2.3 From c23fb2c82267638f9d206cb96bb93e1f93ad7828 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Fri, 4 Nov 2022 11:32:16 +0100 Subject: ipv6: addrlabel: fix infoleak when sending struct ifaddrlblmsg to network When copying a `struct ifaddrlblmsg` to the network, __ifal_reserved remained uninitialized, resulting in a 1-byte infoleak: BUG: KMSAN: kernel-network-infoleak in __netdev_start_xmit ./include/linux/netdevice.h:4841 __netdev_start_xmit ./include/linux/netdevice.h:4841 netdev_start_xmit ./include/linux/netdevice.h:4857 xmit_one net/core/dev.c:3590 dev_hard_start_xmit+0x1dc/0x800 net/core/dev.c:3606 __dev_queue_xmit+0x17e8/0x4350 net/core/dev.c:4256 dev_queue_xmit ./include/linux/netdevice.h:3009 __netlink_deliver_tap_skb net/netlink/af_netlink.c:307 __netlink_deliver_tap+0x728/0xad0 net/netlink/af_netlink.c:325 netlink_deliver_tap net/netlink/af_netlink.c:338 __netlink_sendskb net/netlink/af_netlink.c:1263 netlink_sendskb+0x1d9/0x200 net/netlink/af_netlink.c:1272 netlink_unicast+0x56d/0xf50 net/netlink/af_netlink.c:1360 nlmsg_unicast ./include/net/netlink.h:1061 rtnl_unicast+0x5a/0x80 net/core/rtnetlink.c:758 ip6addrlbl_get+0xfad/0x10f0 net/ipv6/addrlabel.c:628 rtnetlink_rcv_msg+0xb33/0x1570 net/core/rtnetlink.c:6082 ... Uninit was created at: slab_post_alloc_hook+0x118/0xb00 mm/slab.h:742 slab_alloc_node mm/slub.c:3398 __kmem_cache_alloc_node+0x4f2/0x930 mm/slub.c:3437 __do_kmalloc_node mm/slab_common.c:954 __kmalloc_node_track_caller+0x117/0x3d0 mm/slab_common.c:975 kmalloc_reserve net/core/skbuff.c:437 __alloc_skb+0x27a/0xab0 net/core/skbuff.c:509 alloc_skb ./include/linux/skbuff.h:1267 nlmsg_new ./include/net/netlink.h:964 ip6addrlbl_get+0x490/0x10f0 net/ipv6/addrlabel.c:608 rtnetlink_rcv_msg+0xb33/0x1570 net/core/rtnetlink.c:6082 netlink_rcv_skb+0x299/0x550 net/netlink/af_netlink.c:2540 rtnetlink_rcv+0x26/0x30 net/core/rtnetlink.c:6109 netlink_unicast_kernel net/netlink/af_netlink.c:1319 netlink_unicast+0x9ab/0xf50 net/netlink/af_netlink.c:1345 netlink_sendmsg+0xebc/0x10f0 net/netlink/af_netlink.c:1921 ... This patch ensures that the reserved field is always initialized. Reported-by: syzbot+3553517af6020c4f2813f1003fe76ef3cbffe98d@syzkaller.appspotmail.com Fixes: 2a8cc6c89039 ("[IPV6] ADDRCONF: Support RFC3484 configurable address selection policy table.") Signed-off-by: Alexander Potapenko Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- net/ipv6/addrlabel.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 8a22486cf270..17ac45aa7194 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -437,6 +437,7 @@ static void ip6addrlbl_putmsg(struct nlmsghdr *nlh, { struct ifaddrlblmsg *ifal = nlmsg_data(nlh); ifal->ifal_family = AF_INET6; + ifal->__ifal_reserved = 0; ifal->ifal_prefixlen = prefixlen; ifal->ifal_flags = 0; ifal->ifal_index = ifindex; -- cgit v1.2.3 From a3335faebe1608f3e78e24f09501c9f7d5ebf87a Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Mon, 31 Oct 2022 11:30:53 +0800 Subject: can: af_can: can_exit(): add missing dev_remove_pack() of canxl_packet In can_init(), the canxl_packet handler is registered with dev_add_pack() but is not removed in can_exit(). This breaks the packet handler list and can cause a kernel panic when can_init() is called a second time. | > modprobe can && rmmod can | > rmmod xxx && modprobe can | | BUG: unable to handle page fault for address: fffffbfff807d7f4 | RIP: 0010:dev_add_pack+0x133/0x1f0 | Call Trace: | | can_init+0xaa/0x1000 [can] | do_one_initcall+0xd3/0x4e0 | ... Fixes: fb08cba12b52 ("can: canxl: update CAN infrastructure for CAN XL frames") Signed-off-by: Chen Zhongjin Acked-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20221031033053.37849-1-chenzhongjin@huawei.com [mkl: adjust subject and commit message] Signed-off-by: Marc Kleine-Budde --- net/can/af_can.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/can/af_can.c b/net/can/af_can.c index 9503ab10f9b8..5e9e3e1e9825 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -902,6 +902,7 @@ out_pernet: static __exit void can_exit(void) { /* protocol unregister */ + dev_remove_pack(&canxl_packet); dev_remove_pack(&canfd_packet); dev_remove_pack(&can_packet); sock_unregister(PF_CAN); -- cgit v1.2.3 From 8aa59e355949442c408408c2d836e561794c40a1 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Fri, 28 Oct 2022 16:56:50 +0800 Subject: can: af_can: fix NULL pointer dereference in can_rx_register() A NULL pointer dereference occurs when testing as follows: (a) use syscall(__NR_socket, 0x10ul, 3ul, 0) to create a netlink socket. (b) use syscall(__NR_sendmsg, ...) to create bond link device and vxcan link device, and bind the vxcan device to the bond device (the ifenslave command can also be used to bind the vxcan device to the bond device). (c) use syscall(__NR_socket, 0x1dul, 3ul, 1) to create a CAN socket. (d) use syscall(__NR_bind, ...) to bind the bond device to the CAN socket. The bond device invokes the can-raw protocol registration interface to receive CAN packets. However, ml_priv is not allocated for the device, so dev_rcv_lists is NULL in can_rx_register(). In this case, a NULL pointer dereference occurs.
The following is the stack information: BUG: kernel NULL pointer dereference, address: 0000000000000008 PGD 122a4067 P4D 122a4067 PUD 1223c067 PMD 0 Oops: 0000 [#1] PREEMPT SMP RIP: 0010:can_rx_register+0x12d/0x1e0 Call Trace: raw_enable_filters+0x8d/0x120 raw_enable_allfilters+0x3b/0x130 raw_bind+0x118/0x4f0 __sys_bind+0x163/0x1a0 __x64_sys_bind+0x1e/0x30 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: 4e096a18867a ("net: introduce CAN specific pointer in the struct net_device") Signed-off-by: Zhengchao Shao Reviewed-by: Marc Kleine-Budde Link: https://lore.kernel.org/all/20221028085650.170470-1-shaozhengchao@huawei.com Signed-off-by: Marc Kleine-Budde --- net/can/af_can.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/can/af_can.c b/net/can/af_can.c index 5e9e3e1e9825..27dcdcc0b808 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -450,7 +450,7 @@ int can_rx_register(struct net *net, struct net_device *dev, canid_t can_id, /* insert new receiver (dev,canid,mask) -> (func,data) */ - if (dev && dev->type != ARPHRD_CAN) + if (dev && (dev->type != ARPHRD_CAN || !can_get_ml_priv(dev))) return -ENODEV; if (dev && !net_eq(net, dev_net(dev))) -- cgit v1.2.3 From 866337865f3747c68a3e7bb837611e39cec1ecd6 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Fri, 4 Nov 2022 15:25:51 +0100 Subject: can: isotp: fix tx state handling for echo tx processing In commit 4b7fe92c0690 ("can: isotp: add local echo tx processing for consecutive frames") the data flow for consecutive frames (CF) has been reworked to improve the reliability of long data transfers. This rework did not touch the transmission and the tx state changes of single frame (SF) transfers which likely led to the WARN in the isotp_tx_timer_handler() catching a wrong tx state. This patch makes use of the improved frame processing for SF frames and sets the ISOTP_SENDING state in isotp_sendmsg() within the cmpxchg() condition handling. A review of the state machine and the timer handling additionally revealed a missing echo timeout handling in the case of the burst mode in isotp_rcv_echo() and removes a potential timer configuration uncertainty in isotp_rcv_fc() when the receiver requests consecutive frames. 
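The key tx-state change is a claim-by-compare-and-swap: a sender may only move the state from IDLE to SENDING atomically, so two concurrent senders cannot both believe they own the transmission. A standalone C11 sketch of that hand-off (simplified states, not the isotp code):

    #include <stdatomic.h>
    #include <stdio.h>

    enum { TX_IDLE, TX_SENDING };

    static _Atomic int tx_state = TX_IDLE;

    /* Returns 1 if this caller won the right to transmit, 0 otherwise. */
    static int claim_tx(void)
    {
        int expected = TX_IDLE;

        return atomic_compare_exchange_strong(&tx_state, &expected, TX_SENDING);
    }

    int main(void)
    {
        printf("first claim: %d\n", claim_tx());   /* 1: IDLE -> SENDING */
        printf("second claim: %d\n", claim_tx());  /* 0: already SENDING */
        return 0;
    }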
Fixes: 4b7fe92c0690 ("can: isotp: add local echo tx processing for consecutive frames") Link: https://lore.kernel.org/linux-can/CAO4mrfe3dG7cMP1V5FLUkw7s+50c9vichigUMQwsxX4M=45QEw@mail.gmail.com/T/#u Reported-by: Wei Chen Cc: stable@vger.kernel.org # v6.0 Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20221104142551.16924-1-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- net/can/isotp.c | 71 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/net/can/isotp.c b/net/can/isotp.c index a9d1357f8489..608f8c24ae46 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -111,6 +111,9 @@ MODULE_ALIAS("can-proto-6"); #define ISOTP_FC_WT 1 /* wait */ #define ISOTP_FC_OVFLW 2 /* overflow */ +#define ISOTP_FC_TIMEOUT 1 /* 1 sec */ +#define ISOTP_ECHO_TIMEOUT 2 /* 2 secs */ + enum { ISOTP_IDLE = 0, ISOTP_WAIT_FIRST_FC, @@ -258,7 +261,8 @@ static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus) so->lastrxcf_tstamp = ktime_set(0, 0); /* start rx timeout watchdog */ - hrtimer_start(&so->rxtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); + hrtimer_start(&so->rxtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), + HRTIMER_MODE_REL_SOFT); return 0; } @@ -344,6 +348,8 @@ static int check_pad(struct isotp_sock *so, struct canfd_frame *cf, return 0; } +static void isotp_send_cframe(struct isotp_sock *so); + static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae) { struct sock *sk = &so->sk; @@ -398,14 +404,15 @@ static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae) case ISOTP_FC_CTS: so->tx.bs = 0; so->tx.state = ISOTP_SENDING; - /* start cyclic timer for sending CF frame */ - hrtimer_start(&so->txtimer, so->tx_gap, + /* send CF frame and enable echo timeout handling */ + hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); + isotp_send_cframe(so); break; case ISOTP_FC_WT: /* start timer to wait for next FC frame */ - hrtimer_start(&so->txtimer, ktime_set(1, 0), + hrtimer_start(&so->txtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); break; @@ -600,7 +607,7 @@ static int isotp_rcv_cf(struct sock *sk, struct canfd_frame *cf, int ae, /* perform blocksize handling, if enabled */ if (!so->rxfc.bs || ++so->rx.bs < so->rxfc.bs) { /* start rx timeout watchdog */ - hrtimer_start(&so->rxtimer, ktime_set(1, 0), + hrtimer_start(&so->rxtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); return 0; } @@ -829,7 +836,7 @@ static void isotp_rcv_echo(struct sk_buff *skb, void *data) struct isotp_sock *so = isotp_sk(sk); struct canfd_frame *cf = (struct canfd_frame *)skb->data; - /* only handle my own local echo skb's */ + /* only handle my own local echo CF/SF skb's (no FF!) 
*/ if (skb->sk != sk || so->cfecho != *(u32 *)cf->data) return; @@ -849,13 +856,16 @@ static void isotp_rcv_echo(struct sk_buff *skb, void *data) if (so->txfc.bs && so->tx.bs >= so->txfc.bs) { /* stop and wait for FC with timeout */ so->tx.state = ISOTP_WAIT_FC; - hrtimer_start(&so->txtimer, ktime_set(1, 0), + hrtimer_start(&so->txtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); return; } /* no gap between data frames needed => use burst mode */ if (!so->tx_gap) { + /* enable echo timeout handling */ + hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0), + HRTIMER_MODE_REL_SOFT); isotp_send_cframe(so); return; } @@ -879,7 +889,7 @@ static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) /* start timeout for unlikely lost echo skb */ hrtimer_set_expires(&so->txtimer, ktime_add(ktime_get(), - ktime_set(2, 0))); + ktime_set(ISOTP_ECHO_TIMEOUT, 0))); restart = HRTIMER_RESTART; /* push out the next consecutive frame */ @@ -907,7 +917,8 @@ static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) break; default: - WARN_ON_ONCE(1); + WARN_ONCE(1, "can-isotp: tx timer state %08X cfecho %08X\n", + so->tx.state, so->cfecho); } return restart; @@ -923,7 +934,7 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) struct canfd_frame *cf; int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; int wait_tx_done = (so->opt.flags & CAN_ISOTP_WAIT_TX_DONE) ? 1 : 0; - s64 hrtimer_sec = 0; + s64 hrtimer_sec = ISOTP_ECHO_TIMEOUT; int off; int err; @@ -942,6 +953,8 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); if (err) goto err_out; + + so->tx.state = ISOTP_SENDING; } if (!size || size > MAX_MSG_LENGTH) { @@ -986,6 +999,10 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) cf = (struct canfd_frame *)skb->data; skb_put_zero(skb, so->ll.mtu); + /* cfecho should have been zero'ed by init / former isotp_rcv_echo() */ + if (so->cfecho) + pr_notice_once("can-isotp: uninit cfecho %08X\n", so->cfecho); + /* check for single frame transmission depending on TX_DL */ if (size <= so->tx.ll_dl - SF_PCI_SZ4 - ae - off) { /* The message size generally fits into a SingleFrame - good. 
@@ -1011,11 +1028,8 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) else cf->data[ae] |= size; - so->tx.state = ISOTP_IDLE; - wake_up_interruptible(&so->wait); - - /* don't enable wait queue for a single frame transmission */ - wait_tx_done = 0; + /* set CF echo tag for isotp_rcv_echo() (SF-mode) */ + so->cfecho = *(u32 *)cf->data; } else { /* send first frame */ @@ -1031,31 +1045,23 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) /* disable wait for FCs due to activated block size */ so->txfc.bs = 0; - /* cfecho should have been zero'ed by init */ - if (so->cfecho) - pr_notice_once("can-isotp: no fc cfecho %08X\n", - so->cfecho); - - /* set consecutive frame echo tag */ + /* set CF echo tag for isotp_rcv_echo() (CF-mode) */ so->cfecho = *(u32 *)cf->data; - - /* switch directly to ISOTP_SENDING state */ - so->tx.state = ISOTP_SENDING; - - /* start timeout for unlikely lost echo skb */ - hrtimer_sec = 2; } else { /* standard flow control check */ so->tx.state = ISOTP_WAIT_FIRST_FC; /* start timeout for FC */ - hrtimer_sec = 1; - } + hrtimer_sec = ISOTP_FC_TIMEOUT; - hrtimer_start(&so->txtimer, ktime_set(hrtimer_sec, 0), - HRTIMER_MODE_REL_SOFT); + /* no CF echo tag for isotp_rcv_echo() (FF-mode) */ + so->cfecho = 0; + } } + hrtimer_start(&so->txtimer, ktime_set(hrtimer_sec, 0), + HRTIMER_MODE_REL_SOFT); + /* send the first or only CAN frame */ cf->flags = so->ll.tx_flags; @@ -1068,8 +1074,7 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) __func__, ERR_PTR(err)); /* no transmission -> no timeout monitoring */ - if (hrtimer_sec) - hrtimer_cancel(&so->txtimer); + hrtimer_cancel(&so->txtimer); /* reset consecutive frame echo tag */ so->cfecho = 0; -- cgit v1.2.3 From 3eb3d283e8579a22b81dd2ac3987b77465b2a22f Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Fri, 4 Nov 2022 08:50:00 +0100 Subject: can: j1939: j1939_send_one(): fix missing CAN header initialization The read access to struct canxl_frame::len inside of a j1939 created skbuff revealed a missing initialization of reserved and later filled elements in struct can_frame. This patch initializes the 8 byte CAN header with zero. Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Cc: Oleksij Rempel Link: https://lore.kernel.org/linux-can/20221104052235.GA6474@pengutronix.de Reported-by: syzbot+d168ec0caca4697e03b1@syzkaller.appspotmail.com Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20221104075000.105414-1-socketcan@hartkopp.net Cc: stable@vger.kernel.org Signed-off-by: Marc Kleine-Budde --- net/can/j1939/main.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index 144c86b0e3ff..821d4ff303b3 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -336,6 +336,9 @@ int j1939_send_one(struct j1939_priv *priv, struct sk_buff *skb) /* re-claim the CAN_HDR from the SKB */ cf = skb_push(skb, J1939_CAN_HDR); + /* initialize header structure */ + memset(cf, 0, J1939_CAN_HDR); + /* make it a full can frame again */ skb_put(skb, J1939_CAN_FTR + (8 - dlc)); -- cgit v1.2.3 From 1c075b192fe41030457cd4a5f7dea730412bca40 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 4 Nov 2022 16:48:53 -0400 Subject: tipc: fix the msg->req tlv len check in tipc_nl_compat_name_table_dump_header This is a follow-up for commit 974cb0e3e7c9 ("tipc: fix uninit-value in tipc_nl_compat_name_table_dump") where it should have type casted sizeof(..) 
to int to work when TLV_GET_DATA_LEN() returns a negative value. syzbot reported a call trace because of it: BUG: KMSAN: uninit-value in ... tipc_nl_compat_name_table_dump+0x841/0xea0 net/tipc/netlink_compat.c:934 __tipc_nl_compat_dumpit+0xab2/0x1320 net/tipc/netlink_compat.c:238 tipc_nl_compat_dumpit+0x991/0xb50 net/tipc/netlink_compat.c:321 tipc_nl_compat_recv+0xb6e/0x1640 net/tipc/netlink_compat.c:1324 genl_family_rcv_msg_doit net/netlink/genetlink.c:731 [inline] genl_family_rcv_msg net/netlink/genetlink.c:775 [inline] genl_rcv_msg+0x103f/0x1260 net/netlink/genetlink.c:792 netlink_rcv_skb+0x3a5/0x6c0 net/netlink/af_netlink.c:2501 genl_rcv+0x3c/0x50 net/netlink/genetlink.c:803 netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline] netlink_unicast+0xf3b/0x1270 net/netlink/af_netlink.c:1345 netlink_sendmsg+0x1288/0x1440 net/netlink/af_netlink.c:1921 sock_sendmsg_nosec net/socket.c:714 [inline] sock_sendmsg net/socket.c:734 [inline] Reported-by: syzbot+e5dbaaa238680ce206ea@syzkaller.appspotmail.com Fixes: 974cb0e3e7c9 ("tipc: fix uninit-value in tipc_nl_compat_name_table_dump") Signed-off-by: Xin Long Link: https://lore.kernel.org/r/ccd6a7ea801b15aec092c3b532a883b4c5708695.1667594933.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- net/tipc/netlink_compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index fc68733673ba..dfea27a906f2 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -880,7 +880,7 @@ static int tipc_nl_compat_name_table_dump_header(struct tipc_nl_compat_msg *msg) }; ntq = (struct tipc_name_table_query *)TLV_DATA(msg->req); - if (TLV_GET_DATA_LEN(msg->req) < sizeof(struct tipc_name_table_query)) + if (TLV_GET_DATA_LEN(msg->req) < (int)sizeof(struct tipc_name_table_query)) return -EINVAL; depth = ntohl(ntq->depth); -- cgit v1.2.3 From 9f0b773210c27a8f5d98ddb2fc4ba60a42a3285f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 4 Nov 2022 17:45:15 -0400 Subject: sctp: remove the unnecessary sinfo_stream check in sctp_prsctp_prune_unsent Since commit 5bbbbe32a431 ("sctp: introduce stream scheduler foundations"), sctp_stream_outq_migrate() has been called in sctp_stream_init/update to remove those chunks destined for streams higher than the new max. There is no longer a need for this check in sctp_prsctp_prune_unsent().
Signed-off-by: Xin Long Signed-off-by: Jakub Kicinski --- net/sctp/outqueue.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index e213aaf45d67..c99fe3dc19bc 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -384,6 +384,7 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, { struct sctp_outq *q = &asoc->outqueue; struct sctp_chunk *chk, *temp; + struct sctp_stream_out *sout; q->sched->unsched_all(&asoc->stream); @@ -398,12 +399,9 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, sctp_sched_dequeue_common(q, chk); asoc->sent_cnt_removable--; asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; - if (chk->sinfo.sinfo_stream < asoc->stream.outcnt) { - struct sctp_stream_out *streamout = - SCTP_SO(&asoc->stream, chk->sinfo.sinfo_stream); - streamout->ext->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; - } + sout = SCTP_SO(&asoc->stream, chk->sinfo.sinfo_stream); + sout->ext->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; msg_len -= chk->skb->truesize + sizeof(struct sctp_chunk); sctp_chunk_free(chk); -- cgit v1.2.3 From 2f201ae14ae0f91dbf1cffea7bb1e29e81d4d108 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 4 Nov 2022 17:45:16 -0400 Subject: sctp: clear out_curr if all frag chunks of current msg are pruned A crash was reported by Zhen Chen: list_del corruption, ffffa035ddf01c18->next is NULL WARNING: CPU: 1 PID: 250682 at lib/list_debug.c:49 __list_del_entry_valid+0x59/0xe0 RIP: 0010:__list_del_entry_valid+0x59/0xe0 Call Trace: sctp_sched_dequeue_common+0x17/0x70 [sctp] sctp_sched_fcfs_dequeue+0x37/0x50 [sctp] sctp_outq_flush_data+0x85/0x360 [sctp] sctp_outq_uncork+0x77/0xa0 [sctp] sctp_cmd_interpreter.constprop.0+0x164/0x1450 [sctp] sctp_side_effects+0x37/0xe0 [sctp] sctp_do_sm+0xd0/0x230 [sctp] sctp_primitive_SEND+0x2f/0x40 [sctp] sctp_sendmsg_to_asoc+0x3fa/0x5c0 [sctp] sctp_sendmsg+0x3d5/0x440 [sctp] sock_sendmsg+0x5b/0x70 and in sctp_sched_fcfs_dequeue() it dequeued a chunk from stream out_curr outq while this outq was empty. Normally stream->out_curr must be set to NULL once all frag chunks of current msg are dequeued, as we can see in sctp_sched_dequeue_done(). However, in sctp_prsctp_prune_unsent() as it is not a proper dequeue, sctp_sched_dequeue_done() is not called to do this. This patch is to fix it by simply setting out_curr to NULL when the last frag chunk of current msg is dequeued from out_curr stream in sctp_prsctp_prune_unsent(). 
Fixes: 5bbbbe32a431 ("sctp: introduce stream scheduler foundations") Reported-by: Zhen Chen Tested-by: Caowangbao Signed-off-by: Xin Long Signed-off-by: Jakub Kicinski --- net/sctp/outqueue.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index c99fe3dc19bc..20831079fb09 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -403,6 +403,11 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, sout = SCTP_SO(&asoc->stream, chk->sinfo.sinfo_stream); sout->ext->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; + /* clear out_curr if all frag chunks are pruned */ + if (asoc->stream.out_curr == sout && + list_is_last(&chk->frag_list, &chk->msg->chunks)) + asoc->stream.out_curr = NULL; + msg_len -= chk->skb->truesize + sizeof(struct sctp_chunk); sctp_chunk_free(chk); if (msg_len <= 0) -- cgit v1.2.3 From 03832a32bf8ff0a8305d94ddd3979835a807248f Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Thu, 3 Nov 2022 09:12:02 +0800 Subject: netfilter: nfnetlink: fix potential dead lock in nfnetlink_rcv_msg() When type is NFNL_CB_MUTEX and -EAGAIN error occur in nfnetlink_rcv_msg(), it does not execute nfnl_unlock(). That would trigger potential dead lock. Fixes: 50f2db9e368f ("netfilter: nfnetlink: consolidate callback types") Signed-off-by: Ziyang Xuan Signed-off-by: Florian Westphal --- net/netfilter/nfnetlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 9c44518cb70f..6d18fb346868 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -294,6 +294,7 @@ replay: nfnl_lock(subsys_id); if (nfnl_dereference_protected(subsys_id) != ss || nfnetlink_find_client(type, ss) != nc) { + nfnl_unlock(subsys_id); err = -EAGAIN; break; } -- cgit v1.2.3 From 03c1f1ef1584c981935fab2fa0c45d3e43e2c235 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Thu, 3 Nov 2022 22:08:49 +0900 Subject: netfilter: Cleanup nft_net->module_list from nf_tables_exit_net() syzbot reported a warning like below [1]: WARNING: CPU: 3 PID: 9 at net/netfilter/nf_tables_api.c:10096 nf_tables_exit_net+0x71c/0x840 Modules linked in: CPU: 2 PID: 9 Comm: kworker/u8:0 Tainted: G W 6.1.0-rc3-00072-g8e5423e991e8 #47 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-1.fc36 04/01/2014 Workqueue: netns cleanup_net RIP: 0010:nf_tables_exit_net+0x71c/0x840 ... Call Trace: ? __nft_release_table+0xfc0/0xfc0 ops_exit_list+0xb5/0x180 cleanup_net+0x506/0xb10 ? unregister_pernet_device+0x80/0x80 process_one_work+0xa38/0x1730 ? pwq_dec_nr_in_flight+0x2b0/0x2b0 ? rwlock_bug.part.0+0x90/0x90 ? _raw_spin_lock_irq+0x46/0x50 worker_thread+0x67e/0x10e0 ? process_one_work+0x1730/0x1730 kthread+0x2e5/0x3a0 ? kthread_complete_and_exit+0x40/0x40 ret_from_fork+0x1f/0x30 In nf_tables_exit_net(), there is a case where nft_net->commit_list is empty but nft_net->module_list is not empty. Such a case occurs with the following scenario: 1. nfnetlink_rcv_batch() is called 2. nf_tables_newset() returns -EAGAIN and NFNL_BATCH_FAILURE bit is set to status 3. nf_tables_abort() is called with NFNL_ABORT_AUTOLOAD (nft_net->commit_list is released, but nft_net->module_list is not because of NFNL_ABORT_AUTOLOAD flag) 4. Jump to replay label 5. netlink_skb_clone() fails and returns from the function (this is caused by fault injection in the reproducer of syzbot) This patch fixes this issue by calling __nf_tables_abort() when nft_net->module_list is not empty in nf_tables_exit_net(). 
Fixes: eb014de4fd41 ("netfilter: nf_tables: autoload modules from the abort path") Link: https://syzkaller.appspot.com/bug?id=802aba2422de4218ad0c01b46c9525cc9d4e4aa3 [1] Reported-by: syzbot+178efee9e2d7f87f5103@syzkaller.appspotmail.com Signed-off-by: Shigeru Yoshida Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 76bd4d03dbda..e7152d599d73 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -10090,7 +10090,8 @@ static void __net_exit nf_tables_exit_net(struct net *net) struct nftables_pernet *nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); - if (!list_empty(&nft_net->commit_list)) + if (!list_empty(&nft_net->commit_list) || + !list_empty(&nft_net->module_list)) __nf_tables_abort(net, NFNL_ABORT_NONE); __nft_release_tables(net); mutex_unlock(&nft_net->commit_mutex); -- cgit v1.2.3 From d4072058af4fd8fb4658e7452289042a406a9398 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 8 Nov 2022 09:55:17 +0000 Subject: mctp: Fix an error handling path in mctp_init() If mctp_neigh_init() returns an error, the routes resources should be released in the error handling path. Otherwise some resources leak. Fixes: 4d8b9319282a ("mctp: Add neighbour implementation") Signed-off-by: Wei Yongjun Acked-by: Matt Johnston Link: https://lore.kernel.org/r/20221108095517.620115-1-weiyongjun@huaweicloud.com Signed-off-by: Jakub Kicinski --- net/mctp/af_mctp.c | 4 +++- net/mctp/route.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index b6b5e496fa40..fc9e728b6333 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -665,12 +665,14 @@ static __init int mctp_init(void) rc = mctp_neigh_init(); if (rc) - goto err_unreg_proto; + goto err_unreg_routes; mctp_device_init(); return 0; +err_unreg_routes: + mctp_routes_exit(); err_unreg_proto: proto_unregister(&mctp_proto); err_unreg_sock: diff --git a/net/mctp/route.c b/net/mctp/route.c index 2155f15a074c..f9a80b82dc51 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -1400,7 +1400,7 @@ int __init mctp_routes_init(void) return register_pernet_subsys(&mctp_net_ops); } -void __exit mctp_routes_exit(void) +void mctp_routes_exit(void) { unregister_pernet_subsys(&mctp_net_ops); rtnl_unregister(PF_MCTP, RTM_DELROUTE); -- cgit v1.2.3 From 0834ced65a6a1eaa10d0b319b685879a671b29aa Mon Sep 17 00:00:00 2001 From: Yu Liao Date: Thu, 10 Nov 2022 17:03:29 +0800 Subject: net/tls: Fix memory leak in tls_enc_skb() and tls_sw_fallback_init() 'aead_req' and 'aead_send' are allocated but not freed in the default switch case. This commit fixes the potential memory leak by freeing them in that case. Note that the default cases here should never be reached as they'd mean we allowed offloading an unsupported algorithm.
Fixes: ea7a9d88ba21 ("net/tls: Use cipher sizes structs") Signed-off-by: Yu Liao Reviewed-by: Gal Pressman Link: https://lore.kernel.org/r/20221110090329.2036382-1-liaoyu15@huawei.com Signed-off-by: Jakub Kicinski --- net/tls/tls_device_fallback.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index cdb391a8754b..7fbb1d0b69b3 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -346,7 +346,7 @@ static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx, salt = tls_ctx->crypto_send.aes_gcm_256.salt; break; default: - return NULL; + goto free_req; } cipher_sz = &tls_cipher_size_desc[tls_ctx->crypto_send.info.cipher_type]; buf_len = cipher_sz->salt + cipher_sz->iv + TLS_AAD_SPACE_SIZE + @@ -492,7 +492,8 @@ int tls_sw_fallback_init(struct sock *sk, key = ((struct tls12_crypto_info_aes_gcm_256 *)crypto_info)->key; break; default: - return -EINVAL; + rc = -EINVAL; + goto free_aead; } cipher_sz = &tls_cipher_size_desc[crypto_info->cipher_type]; -- cgit v1.2.3 From 8fbb53c8bfd8c56ecf1f78dc821778b58f505503 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Fri, 11 Nov 2022 09:47:34 +0800 Subject: net: caif: fix double disconnect client in chnl_net_open() When the connection to the client times out, the client is disconnected twice in chnl_net_open(). Remove one of the disconnect calls. Compile tested only. Fixes: 2aa40aef9deb ("caif: Use link layer MTU instead of fixed MTU") Signed-off-by: Zhengchao Shao Signed-off-by: David S. Miller --- net/caif/chnl_net.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index 4d63ef13a1fd..f35fc87c453a 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -310,9 +310,6 @@ static int chnl_net_open(struct net_device *dev) if (result == 0) { pr_debug("connect timeout\n"); - caif_disconnect_client(dev_net(dev), &priv->chnl); - priv->state = CAIF_DISCONNECTED; - pr_debug("state disconnected\n"); result = -ETIMEDOUT; goto error; } -- cgit v1.2.3 From ed1fe1bebe18884b11e5536b5ac42e3a48960835 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 11 Nov 2022 23:10:20 +0200 Subject: net: dsa: make dsa_master_ioctl() see through port_hwtstamp_get() shims MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are multi-generational drivers like mv88e6xxx which have code like this: int mv88e6xxx_port_hwtstamp_get(struct dsa_switch *ds, int port, struct ifreq *ifr) { if (!chip->info->ptp_support) return -EOPNOTSUPP; ... } DSA wants to deny PTP timestamping on the master if the switch supports timestamping too. However it currently relies on the presence of the port_hwtstamp_get() callback to determine PTP capability, and this clearly does not work in that case (method is present but returns -EOPNOTSUPP). We should not deny PTP on the DSA master for those switches which truly do not support hardware timestamping. Create a dsa_port_supports_hwtstamp() method which actually probes for support by calling port_hwtstamp_get() and seeing whether that returned -EOPNOTSUPP or not. Fixes: f685e609a301 ("net: dsa: Deny PTP on master if switch supports it") Link: https://patchwork.kernel.org/project/netdevbpf/patch/20221110124345.3901389-1-festevam@gmail.com/ Reported-by: Fabio Estevam Reported-by: Steffen Bätz Signed-off-by: Vladimir Oltean Tested-by: Fabio Estevam Signed-off-by: David S.
Miller --- net/dsa/dsa_priv.h | 1 + net/dsa/master.c | 3 +-- net/dsa/port.c | 16 ++++++++++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 6e65c7ffd6f3..71e9707d11d4 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -210,6 +210,7 @@ static inline struct net_device *dsa_master_find_slave(struct net_device *dev, extern struct rtnl_link_ops dsa_link_ops __read_mostly; /* port.c */ +bool dsa_port_supports_hwtstamp(struct dsa_port *dp, struct ifreq *ifr); void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp, const struct dsa_device_ops *tag_ops); int dsa_port_set_state(struct dsa_port *dp, u8 state, bool do_fast_age); diff --git a/net/dsa/master.c b/net/dsa/master.c index 40367ab41cf8..421de166515f 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -204,8 +204,7 @@ static int dsa_master_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) * switch in the tree that is PTP capable. */ list_for_each_entry(dp, &dst->ports, list) - if (dp->ds->ops->port_hwtstamp_get || - dp->ds->ops->port_hwtstamp_set) + if (dsa_port_supports_hwtstamp(dp, ifr)) return -EBUSY; break; } diff --git a/net/dsa/port.c b/net/dsa/port.c index 208168276995..750fe68d9b2a 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -110,6 +110,22 @@ static bool dsa_port_can_configure_learning(struct dsa_port *dp) return !err; } +bool dsa_port_supports_hwtstamp(struct dsa_port *dp, struct ifreq *ifr) +{ + struct dsa_switch *ds = dp->ds; + int err; + + if (!ds->ops->port_hwtstamp_get || !ds->ops->port_hwtstamp_set) + return false; + + /* "See through" shim implementations of the "get" method. + * This will clobber the ifreq structure, but we will either return an + * error, or the master will overwrite it with proper values. + */ + err = ds->ops->port_hwtstamp_get(ds, dp->index, ifr); + return err != -EOPNOTSUPP; +} + int dsa_port_set_state(struct dsa_port *dp, u8 state, bool do_fast_age) { struct dsa_switch *ds = dp->ds; -- cgit v1.2.3 From 5121197ecc5db58c07da95eb1ff82b98b121a221 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sun, 13 Nov 2022 16:51:19 -0800 Subject: kcm: close race conditions on sk_receive_queue sk->sk_receive_queue is protected by skb queue lock, but for KCM sockets its RX path takes mux->rx_lock to protect more than just skb queue. However, kcm_recvmsg() still only grabs the skb queue lock, so race conditions still exist. We can teach kcm_recvmsg() to grab mux->rx_lock too but this would introduce a potential performance regression as struct kcm_mux can be shared by multiple KCM sockets. So we have to enforce skb queue lock in requeue_rx_msgs() and handle skb peek case carefully in kcm_wait_data(). Fortunately, skb_recv_datagram() already handles it nicely and is widely used by other sockets, we can just switch to skb_recv_datagram() after getting rid of the unnecessary sock lock in kcm_recvmsg() and kcm_splice_read(). Side note: SOCK_DONE is not used by KCM sockets, so it is safe to get rid of this check too. I ran the original syzbot reproducer for 30 min without seeing any issue. 
Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module") Reported-by: syzbot+278279efdd2730dd14bf@syzkaller.appspotmail.com Reported-by: shaozhengchao Cc: Paolo Abeni Cc: Tom Herbert Signed-off-by: Cong Wang Link: https://lore.kernel.org/r/20221114005119.597905-1-xiyou.wangcong@gmail.com Signed-off-by: Paolo Abeni --- net/kcm/kcmsock.c | 58 ++++++------------------------------------------------- 1 file changed, 6 insertions(+), 52 deletions(-) (limited to 'net') diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index a5004228111d..890a2423f559 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -222,7 +222,7 @@ static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head) struct sk_buff *skb; struct kcm_sock *kcm; - while ((skb = __skb_dequeue(head))) { + while ((skb = skb_dequeue(head))) { /* Reset destructor to avoid calling kcm_rcv_ready */ skb->destructor = sock_rfree; skb_orphan(skb); @@ -1085,53 +1085,17 @@ out_error: return err; } -static struct sk_buff *kcm_wait_data(struct sock *sk, int flags, - long timeo, int *err) -{ - struct sk_buff *skb; - - while (!(skb = skb_peek(&sk->sk_receive_queue))) { - if (sk->sk_err) { - *err = sock_error(sk); - return NULL; - } - - if (sock_flag(sk, SOCK_DONE)) - return NULL; - - if ((flags & MSG_DONTWAIT) || !timeo) { - *err = -EAGAIN; - return NULL; - } - - sk_wait_data(sk, &timeo, NULL); - - /* Handle signals */ - if (signal_pending(current)) { - *err = sock_intr_errno(timeo); - return NULL; - } - } - - return skb; -} - static int kcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); int err = 0; - long timeo; struct strp_msg *stm; int copied = 0; struct sk_buff *skb; - timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - - lock_sock(sk); - - skb = kcm_wait_data(sk, flags, timeo, &err); + skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; @@ -1162,14 +1126,11 @@ msg_finished: /* Finished with message */ msg->msg_flags |= MSG_EOR; KCM_STATS_INCR(kcm->stats.rx_msgs); - skb_unlink(skb, &sk->sk_receive_queue); - kfree_skb(skb); } } out: - release_sock(sk); - + skb_free_datagram(sk, skb); return copied ? : err; } @@ -1179,7 +1140,6 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, { struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); - long timeo; struct strp_msg *stm; int err = 0; ssize_t copied; @@ -1187,11 +1147,7 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, /* Only support splice for SOCKSEQPACKET */ - timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - - lock_sock(sk); - - skb = kcm_wait_data(sk, flags, timeo, &err); + skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto err_out; @@ -1219,13 +1175,11 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, * finish reading the message. */ - release_sock(sk); - + skb_free_datagram(sk, skb); return copied; err_out: - release_sock(sk); - + skb_free_datagram(sk, skb); return err; } -- cgit v1.2.3 From 9d45921ee4cb364910097e7d1b7558559c2f9fd2 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 14 Nov 2022 10:45:09 +0200 Subject: bridge: switchdev: Fix memory leaks when changing VLAN protocol The bridge driver can offload VLANs to the underlying hardware either via switchdev or the 8021q driver. When the former is used, the VLAN is marked in the bridge driver with the 'BR_VLFLAG_ADDED_BY_SWITCHDEV' private flag. 
To avoid the memory leaks mentioned in the cited commit, the bridge driver will try to delete a VLAN via the 8021q driver if the VLAN is not marked with the previously mentioned flag. When the VLAN protocol of the bridge changes, switchdev drivers are notified via the 'SWITCHDEV_ATTR_ID_BRIDGE_VLAN_PROTOCOL' attribute, but the 8021q driver is also called to add the existing VLANs with the new protocol and delete them with the old protocol. In case the VLANs were offloaded via switchdev, the above behavior is both redundant and buggy. Redundant because the VLANs are already programmed in hardware and drivers that support VLAN protocol change (currently only mlx5) change the protocol upon the switchdev attribute notification. Buggy because the 8021q driver is called despite these VLANs being marked with 'BR_VLFLAG_ADDED_BY_SWITCHDEV'. This leads to memory leaks [1] when the VLANs are deleted. Fix by not calling the 8021q driver for VLANs that were already programmed via switchdev. [1] unreferenced object 0xffff8881f6771200 (size 256): comm "ip", pid 446855, jiffies 4298238841 (age 55.240s) hex dump (first 32 bytes): 00 00 7f 0e 83 88 ff ff 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000012819ac>] vlan_vid_add+0x437/0x750 [<00000000f2281fad>] __br_vlan_set_proto+0x289/0x920 [<000000000632b56f>] br_changelink+0x3d6/0x13f0 [<0000000089d25f04>] __rtnl_newlink+0x8ae/0x14c0 [<00000000f6276baf>] rtnl_newlink+0x5f/0x90 [<00000000746dc902>] rtnetlink_rcv_msg+0x336/0xa00 [<000000001c2241c0>] netlink_rcv_skb+0x11d/0x340 [<0000000010588814>] netlink_unicast+0x438/0x710 [<00000000e1a4cd5c>] netlink_sendmsg+0x788/0xc40 [<00000000e8992d4e>] sock_sendmsg+0xb0/0xe0 [<00000000621b8f91>] ____sys_sendmsg+0x4ff/0x6d0 [<000000000ea26996>] ___sys_sendmsg+0x12e/0x1b0 [<00000000684f7e25>] __sys_sendmsg+0xab/0x130 [<000000004538b104>] do_syscall_64+0x3d/0x90 [<0000000091ed9678>] entry_SYSCALL_64_after_hwframe+0x46/0xb0 Fixes: 279737939a81 ("net: bridge: Fix VLANs memory leak") Reported-by: Vlad Buslov Tested-by: Vlad Buslov Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20221114084509.860831-1-idosch@nvidia.com Signed-off-by: Paolo Abeni --- net/bridge/br_vlan.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 6e53dc991409..9ffd40b8270c 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -959,6 +959,8 @@ int __br_vlan_set_proto(struct net_bridge *br, __be16 proto, list_for_each_entry(p, &br->port_list, list) { vg = nbp_vlan_group(p); list_for_each_entry(vlan, &vg->vlan_list, vlist) { + if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) + continue; err = vlan_vid_add(p->dev, proto, vlan->vid); if (err) goto err_filt; @@ -973,8 +975,11 @@ int __br_vlan_set_proto(struct net_bridge *br, __be16 proto, /* Delete VLANs for the old proto from the device filter. 
*/ list_for_each_entry(p, &br->port_list, list) { vg = nbp_vlan_group(p); - list_for_each_entry(vlan, &vg->vlan_list, vlist) + list_for_each_entry(vlan, &vg->vlan_list, vlist) { + if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) + continue; vlan_vid_del(p->dev, oldproto, vlan->vid); + } } return 0; @@ -983,13 +988,19 @@ err_filt: attr.u.vlan_protocol = ntohs(oldproto); switchdev_port_attr_set(br->dev, &attr, NULL); - list_for_each_entry_continue_reverse(vlan, &vg->vlan_list, vlist) + list_for_each_entry_continue_reverse(vlan, &vg->vlan_list, vlist) { + if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) + continue; vlan_vid_del(p->dev, proto, vlan->vid); + } list_for_each_entry_continue_reverse(p, &br->port_list, list) { vg = nbp_vlan_group(p); - list_for_each_entry(vlan, &vg->vlan_list, vlist) + list_for_each_entry(vlan, &vg->vlan_list, vlist) { + if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) + continue; vlan_vid_del(p->dev, proto, vlan->vid); + } } return err; -- cgit v1.2.3 From 2929cceb2fcf0ded7182562e4888afafece82cce Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 14 Nov 2022 11:05:19 +0000 Subject: net/x25: Fix skb leak in x25_lapb_receive_frame() x25_lapb_receive_frame() using skb_copy() to get a private copy of skb, the new skb should be freed in the undersized/fragmented skb error handling path. Otherwise there is a memory leak. Fixes: cb101ed2c3c7 ("x25: Handle undersized/fragmented skbs") Signed-off-by: Wei Yongjun Acked-by: Martin Schiller Link: https://lore.kernel.org/r/20221114110519.514538-1-weiyongjun@huaweicloud.com Signed-off-by: Jakub Kicinski --- net/x25/x25_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c index 5259ef8f5242..748d8630ab58 100644 --- a/net/x25/x25_dev.c +++ b/net/x25/x25_dev.c @@ -117,7 +117,7 @@ int x25_lapb_receive_frame(struct sk_buff *skb, struct net_device *dev, if (!pskb_may_pull(skb, 1)) { x25_neigh_put(nb); - return 0; + goto drop; } switch (skb->data[0]) { -- cgit v1.2.3 From 4e0c19fcb8b5323716140fa82b79aa9f60e60407 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 14 Nov 2022 16:35:51 +0200 Subject: net: dsa: don't leak tagger-owned storage on switch driver unbind In the initial commit dc452a471dba ("net: dsa: introduce tagger-owned storage for private and shared data"), we had a call to tag_ops->disconnect(dst) issued from dsa_tree_free(), which is called at tree teardown time. There were problems with connecting to a switch tree as a whole, so this got reworked to connecting to individual switches within the tree. In this process, tag_ops->disconnect(ds) was made to be called only from switch.c (cross-chip notifiers emitted as a result of dynamic tag proto changes), but the normal driver teardown code path wasn't replaced with anything. Solve this problem by adding a function that does the opposite of dsa_switch_setup_tag_protocol(), which is called from the equivalent spot in dsa_switch_teardown(). 
The positioning here also ensures that we won't have any use-after-free in tagging protocol (*rcv) ops, since the teardown sequence is as follows: dsa_tree_teardown -> dsa_tree_teardown_master -> dsa_master_teardown -> unsets master->dsa_ptr, making no further packets match the ETH_P_XDSA packet type handler -> dsa_tree_teardown_ports -> dsa_port_teardown -> dsa_slave_destroy -> unregisters DSA net devices, there is even a synchronize_net() in unregister_netdevice_many() -> dsa_tree_teardown_switches -> dsa_switch_teardown -> dsa_switch_teardown_tag_protocol -> finally frees the tagger-owned storage Fixes: 7f2973149c22 ("net: dsa: make tagging protocols connect to individual switches from a tree") Signed-off-by: Vladimir Oltean Reviewed-by: Saeed Mahameed Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20221114143551.1906361-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/dsa2.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index e504a18fc125..5417f7b1187c 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -864,6 +864,14 @@ disconnect: return err; } +static void dsa_switch_teardown_tag_protocol(struct dsa_switch *ds) +{ + const struct dsa_device_ops *tag_ops = ds->dst->tag_ops; + + if (tag_ops->disconnect) + tag_ops->disconnect(ds); +} + static int dsa_switch_setup(struct dsa_switch *ds) { struct dsa_devlink_priv *dl_priv; @@ -953,6 +961,8 @@ static void dsa_switch_teardown(struct dsa_switch *ds) ds->slave_mii_bus = NULL; } + dsa_switch_teardown_tag_protocol(ds); + if (ds->ops->teardown) ds->ops->teardown(ds); -- cgit v1.2.3 From b68777d54fac21fc833ec26ea1a2a84f975ab035 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 14 Nov 2022 20:16:19 +0100 Subject: l2tp: Serialize access to sk_user_data with sk_callback_lock sk->sk_user_data has multiple users, which are not compatible with each other. Writers must synchronize by grabbing the sk->sk_callback_lock. l2tp currently fails to grab the lock when modifying the underlying tunnel socket fields. Fix it by adding appropriate locking. We err on the side of safety and grab the sk_callback_lock also inside the sk_destruct callback overridden by l2tp, even though there should be no refs allowing access to the sock at the time when sk_destruct gets called. v4: - serialize write to sk_user_data in l2tp sk_destruct v3: - switch from sock lock to sk_callback_lock - document write-protection for sk_user_data v2: - update Fixes to point to origin of the bug - use real names in Reported/Tested-by tags Cc: Tom Parkin Fixes: 3557baabf280 ("[L2TP]: PPP over L2TP driver core") Reported-by: Haowei Yan Signed-off-by: Jakub Sitnicki Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- net/l2tp/l2tp_core.c | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 5db02546941c..e0517ecc6531 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -323,7 +323,7 @@ struct sk_filter; * @sk_tskey: counter to disambiguate concurrent tstamp requests * @sk_zckey: counter to order MSG_ZEROCOPY notifications * @sk_socket: Identd and reporting IO signals - * @sk_user_data: RPC layer private data + * @sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock. 
* @sk_frag: cached page frag * @sk_peek_off: current peek_offset value * @sk_send_head: front of stuff to transmit diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 7499c51b1850..754fdda8a5f5 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1150,8 +1150,10 @@ static void l2tp_tunnel_destruct(struct sock *sk) } /* Remove hooks into tunnel socket */ + write_lock_bh(&sk->sk_callback_lock); sk->sk_destruct = tunnel->old_sk_destruct; sk->sk_user_data = NULL; + write_unlock_bh(&sk->sk_callback_lock); /* Call the original destructor */ if (sk->sk_destruct) @@ -1469,16 +1471,18 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, sock = sockfd_lookup(tunnel->fd, &ret); if (!sock) goto err; - - ret = l2tp_validate_socket(sock->sk, net, tunnel->encap); - if (ret < 0) - goto err_sock; } + sk = sock->sk; + write_lock(&sk->sk_callback_lock); + + ret = l2tp_validate_socket(sk, net, tunnel->encap); + if (ret < 0) + goto err_sock; + tunnel->l2tp_net = net; pn = l2tp_pernet(net); - sk = sock->sk; sock_hold(sk); tunnel->sock = sk; @@ -1504,7 +1508,7 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, setup_udp_tunnel_sock(net, sock, &udp_cfg); } else { - sk->sk_user_data = tunnel; + rcu_assign_sk_user_data(sk, tunnel); } tunnel->old_sk_destruct = sk->sk_destruct; @@ -1518,6 +1522,7 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, if (tunnel->fd >= 0) sockfd_put(sock); + write_unlock(&sk->sk_callback_lock); return 0; err_sock: @@ -1525,6 +1530,8 @@ err_sock: sock_release(sock); else sockfd_put(sock); + + write_unlock(&sk->sk_callback_lock); err: return ret; } -- cgit v1.2.3 From aeac4ec8f46d610a10adbaeff5e2edf6a88ffc62 Mon Sep 17 00:00:00 2001 From: Gleb Mazovetskiy Date: Mon, 14 Nov 2022 22:56:16 +0000 Subject: tcp: configurable source port perturb table size On embedded systems with little memory and no relevant security concerns, it is beneficial to reduce the size of the table. Reducing the size from 2^16 to 2^8 saves 255 KiB of kernel RAM. Makes the table size configurable as an expert option. The size was previously increased from 2^8 to 2^16 in commit 4c2c8f03a5ab ("tcp: increase source port perturb table to 2^16"). Signed-off-by: Gleb Mazovetskiy Reviewed-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 10 ++++++++++ net/ipv4/inet_hashtables.c | 10 +++++----- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index e983bb0c5012..2dfb12230f08 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -402,6 +402,16 @@ config INET_IPCOMP If unsure, say Y. +config INET_TABLE_PERTURB_ORDER + int "INET: Source port perturbation table size (as power of 2)" if EXPERT + default 16 + help + Source port perturbation table size (as power of 2) for + RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm. + + The default is almost always what you want. + Only change this if you know what you are doing. + config INET_XFRM_TUNNEL tristate select INET_TUNNEL diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index d3dc28156622..033bf3c2538f 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -906,13 +906,13 @@ EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr); * Note that we use 32bit integers (vs RFC 'short integers') * because 2^16 is not a multiple of num_ephemeral and this * property might be used by clever attacker. 
+ * * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though - * attacks were since demonstrated, thus we use 65536 instead to really - * give more isolation and privacy, at the expense of 256kB of kernel - * memory. + * attacks were since demonstrated, thus we use 65536 by default instead + * to really give more isolation and privacy, at the expense of 256kB + * of kernel memory. */ -#define INET_TABLE_PERTURB_SHIFT 16 -#define INET_TABLE_PERTURB_SIZE (1 << INET_TABLE_PERTURB_SHIFT) +#define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER) static u32 *table_perturb; int __inet_hash_connect(struct inet_timewait_death_row *death_row, -- cgit v1.2.3 From 8207f253a097fe15c93d85ac15ebb73c5e39e1e1 Mon Sep 17 00:00:00 2001 From: Thomas Zeitlhofer Date: Tue, 15 Nov 2022 23:09:41 +0100 Subject: net: neigh: decrement the family specific qlen Commit 0ff4eb3d5ebb ("neighbour: make proxy_queue.qlen limit per-device") introduced the length counter qlen in struct neigh_parms. There are separate neigh_parms instances for IPv4/ARP and IPv6/ND, and while the family specific qlen is incremented in pneigh_enqueue(), the mentioned commit decrements always the IPv4/ARP specific qlen, regardless of the currently processed family, in pneigh_queue_purge() and neigh_proxy_process(). As a result, with IPv6/ND, the family specific qlen is only incremented (and never decremented) until it exceeds PROXY_QLEN, and then, according to the check in pneigh_enqueue(), neighbor solicitations are not answered anymore. As an example, this is noted when using the subnet-router anycast address to access a Linux router. After a certain amount of time (in the observed case, qlen exceeded PROXY_QLEN after two days), the Linux router stops answering neighbor solicitations for its subnet-router anycast address and effectively becomes unreachable. Another result with IPv6/ND is that the IPv4/ARP specific qlen is decremented more often than incremented. This leads to negative qlen values, as a signed integer has been used for the length counter qlen, and potentially to an integer overflow. Fix this by introducing the helper function neigh_parms_qlen_dec(), which decrements the family specific qlen. Thereby, make use of the existing helper function neigh_get_dev_parms_rcu(), whose definition therefore needs to be placed earlier in neighbour.c. Take the family member from struct neigh_table to determine the currently processed family and appropriately call neigh_parms_qlen_dec() from pneigh_queue_purge() and neigh_proxy_process(). Additionally, use an unsigned integer for the length counter qlen. Fixes: 0ff4eb3d5ebb ("neighbour: make proxy_queue.qlen limit per-device") Signed-off-by: Thomas Zeitlhofer Signed-off-by: David S. 
Miller --- include/net/neighbour.h | 2 +- net/core/neighbour.c | 58 +++++++++++++++++++++++++------------------------ 2 files changed, 31 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 20745cf7ae1a..2f2a6023fb0e 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -83,7 +83,7 @@ struct neigh_parms { struct rcu_head rcu_head; int reachable_time; - int qlen; + u32 qlen; int data[NEIGH_VAR_DATA_MAX]; DECLARE_BITMAP(data_state, NEIGH_VAR_DATA_MAX); }; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index a77a85e357e0..952a54763358 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -307,7 +307,31 @@ static int neigh_del_timer(struct neighbour *n) return 0; } -static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net) +static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev, + int family) +{ + switch (family) { + case AF_INET: + return __in_dev_arp_parms_get_rcu(dev); + case AF_INET6: + return __in6_dev_nd_parms_get_rcu(dev); + } + return NULL; +} + +static void neigh_parms_qlen_dec(struct net_device *dev, int family) +{ + struct neigh_parms *p; + + rcu_read_lock(); + p = neigh_get_dev_parms_rcu(dev, family); + if (p) + p->qlen--; + rcu_read_unlock(); +} + +static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net, + int family) { struct sk_buff_head tmp; unsigned long flags; @@ -321,13 +345,7 @@ static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net) struct net_device *dev = skb->dev; if (net == NULL || net_eq(dev_net(dev), net)) { - struct in_device *in_dev; - - rcu_read_lock(); - in_dev = __in_dev_get_rcu(dev); - if (in_dev) - in_dev->arp_parms->qlen--; - rcu_read_unlock(); + neigh_parms_qlen_dec(dev, family); __skb_unlink(skb, list); __skb_queue_tail(&tmp, skb); } @@ -409,7 +427,8 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, write_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev, skip_perm); pneigh_ifdown_and_unlock(tbl, dev); - pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL); + pneigh_queue_purge(&tbl->proxy_queue, dev ? 
dev_net(dev) : NULL, + tbl->family); if (skb_queue_empty_lockless(&tbl->proxy_queue)) del_timer_sync(&tbl->proxy_timer); return 0; @@ -1621,13 +1640,8 @@ static void neigh_proxy_process(struct timer_list *t) if (tdif <= 0) { struct net_device *dev = skb->dev; - struct in_device *in_dev; - rcu_read_lock(); - in_dev = __in_dev_get_rcu(dev); - if (in_dev) - in_dev->arp_parms->qlen--; - rcu_read_unlock(); + neigh_parms_qlen_dec(dev, tbl->family); __skb_unlink(skb, &tbl->proxy_queue); if (tbl->proxy_redo && netif_running(dev)) { @@ -1821,7 +1835,7 @@ int neigh_table_clear(int index, struct neigh_table *tbl) cancel_delayed_work_sync(&tbl->managed_work); cancel_delayed_work_sync(&tbl->gc_work); del_timer_sync(&tbl->proxy_timer); - pneigh_queue_purge(&tbl->proxy_queue, NULL); + pneigh_queue_purge(&tbl->proxy_queue, NULL, tbl->family); neigh_ifdown(tbl, NULL); if (atomic_read(&tbl->entries)) pr_crit("neighbour leakage\n"); @@ -3539,18 +3553,6 @@ static int proc_unres_qlen(struct ctl_table *ctl, int write, return ret; } -static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev, - int family) -{ - switch (family) { - case AF_INET: - return __in_dev_arp_parms_get_rcu(dev); - case AF_INET6: - return __in6_dev_nd_parms_get_rcu(dev); - } - return NULL; -} - static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p, int index) { -- cgit v1.2.3 From 3bcd6c7eaa53b56c3f584da46a1f7652e759d0e5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 16 Nov 2022 14:02:28 +0000 Subject: rxrpc: Fix race between conn bundle lookup and bundle removal [ZDI-CAN-15975] After rxrpc_unbundle_conn() has removed a connection from a bundle, it checks to see if there are any conns with available channels and, if not, removes and attempts to destroy the bundle. Whilst it does check after grabbing client_bundles_lock that there are no connections attached, this races with rxrpc_look_up_bundle() retrieving the bundle, but not attaching a connection for the connection to be attached later. There is therefore a window in which the bundle can get destroyed before we manage to attach a new connection to it. Fix this by adding an "active" counter to struct rxrpc_bundle: (1) rxrpc_connect_call() obtains an active count by prepping/looking up a bundle and ditches it before returning. (2) If, during rxrpc_connect_call(), a connection is added to the bundle, this obtains an active count, which is held until the connection is discarded. (3) rxrpc_deactivate_bundle() is created to drop an active count on a bundle and destroy it when the active count reaches 0. The active count is checked inside client_bundles_lock() to prevent a race with rxrpc_look_up_bundle(). (4) rxrpc_unbundle_conn() then calls rxrpc_deactivate_bundle(). Fixes: 245500d853e9 ("rxrpc: Rewrite the client connection manager") Reported-by: zdi-disclosures@trendmicro.com # ZDI-CAN-15975 Signed-off-by: David Howells Tested-by: zdi-disclosures@trendmicro.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: David S. 
Miller --- net/rxrpc/ar-internal.h | 1 + net/rxrpc/conn_client.c | 38 +++++++++++++++++++++++--------------- 2 files changed, 24 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 1ad0ec5afb50..8499ceb7719c 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -399,6 +399,7 @@ enum rxrpc_conn_proto_state { struct rxrpc_bundle { struct rxrpc_conn_parameters params; refcount_t ref; + atomic_t active; /* Number of active users */ unsigned int debug_id; bool try_upgrade; /* True if the bundle is attempting upgrade */ bool alloc_conn; /* True if someone's getting a conn */ diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 3c9eeb5b750c..bdb335cb2d05 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -40,6 +40,8 @@ __read_mostly unsigned long rxrpc_conn_idle_client_fast_expiry = 2 * HZ; DEFINE_IDR(rxrpc_client_conn_ids); static DEFINE_SPINLOCK(rxrpc_conn_id_lock); +static void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle); + /* * Get a connection ID and epoch for a client connection from the global pool. * The connection struct pointer is then recorded in the idr radix tree. The @@ -123,6 +125,7 @@ static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_conn_parameters *cp, bundle->params = *cp; rxrpc_get_peer(bundle->params.peer); refcount_set(&bundle->ref, 1); + atomic_set(&bundle->active, 1); spin_lock_init(&bundle->channel_lock); INIT_LIST_HEAD(&bundle->waiting_calls); } @@ -149,7 +152,7 @@ void rxrpc_put_bundle(struct rxrpc_bundle *bundle) dead = __refcount_dec_and_test(&bundle->ref, &r); - _debug("PUT B=%x %d", d, r); + _debug("PUT B=%x %d", d, r - 1); if (dead) rxrpc_free_bundle(bundle); } @@ -338,6 +341,7 @@ found_bundle_free: rxrpc_free_bundle(candidate); found_bundle: rxrpc_get_bundle(bundle); + atomic_inc(&bundle->active); spin_unlock(&local->client_bundles_lock); _leave(" = %u [found]", bundle->debug_id); return bundle; @@ -435,6 +439,7 @@ static void rxrpc_add_conn_to_bundle(struct rxrpc_bundle *bundle, gfp_t gfp) if (old) trace_rxrpc_client(old, -1, rxrpc_client_replace); candidate->bundle_shift = shift; + atomic_inc(&bundle->active); bundle->conns[i] = candidate; for (j = 0; j < RXRPC_MAXCALLS; j++) set_bit(shift + j, &bundle->avail_chans); @@ -725,6 +730,7 @@ granted_channel: smp_rmb(); out_put_bundle: + rxrpc_deactivate_bundle(bundle); rxrpc_put_bundle(bundle); out: _leave(" = %d", ret); @@ -900,9 +906,8 @@ out: static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) { struct rxrpc_bundle *bundle = conn->bundle; - struct rxrpc_local *local = bundle->params.local; unsigned int bindex; - bool need_drop = false, need_put = false; + bool need_drop = false; int i; _enter("C=%x", conn->debug_id); @@ -921,15 +926,22 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) } spin_unlock(&bundle->channel_lock); - /* If there are no more connections, remove the bundle */ - if (!bundle->avail_chans) { - _debug("maybe unbundle"); - spin_lock(&local->client_bundles_lock); + if (need_drop) { + rxrpc_deactivate_bundle(bundle); + rxrpc_put_connection(conn); + } +} - for (i = 0; i < ARRAY_SIZE(bundle->conns); i++) - if (bundle->conns[i]) - break; - if (i == ARRAY_SIZE(bundle->conns) && !bundle->params.exclusive) { +/* + * Drop the active count on a bundle. 
+ */ +static void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle) +{ + struct rxrpc_local *local = bundle->params.local; + bool need_put = false; + + if (atomic_dec_and_lock(&bundle->active, &local->client_bundles_lock)) { + if (!bundle->params.exclusive) { _debug("erase bundle"); rb_erase(&bundle->local_node, &local->client_bundles); need_put = true; @@ -939,10 +951,6 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) if (need_put) rxrpc_put_bundle(bundle); } - - if (need_drop) - rxrpc_put_connection(conn); - _leave(""); } /* -- cgit v1.2.3 From 0ad6bded175e829c2ca261529c9dce39a32a042d Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Wed, 16 Nov 2022 21:02:49 +0800 Subject: nfc/nci: fix race with opening and closing Previously we leveraged NCI_UNREG and the lock inside nci_close_device() to prevent the race condition between opening a device and closing a device. However, this still has a problem because a failed opening command will erase the NCI_UNREG flag and allow another opening command to bypass the status checking. This fix corrects that by making sure the NCI_UNREG flag is preserved. Reported-by: syzbot+43475bf3cfbd6e41f5b7@syzkaller.appspotmail.com Fixes: 48b71a9e66c2 ("NFC: add NCI_UNREG flag to eliminate the race") Signed-off-by: Lin Ma Signed-off-by: David S. Miller --- net/nfc/nci/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 6a193cce2a75..4ffdf2f45c44 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -542,7 +542,7 @@ static int nci_open_device(struct nci_dev *ndev) skb_queue_purge(&ndev->tx_q); ndev->ops->close(ndev); - ndev->flags = 0; + ndev->flags &= BIT(NCI_UNREG); } done: -- cgit v1.2.3 From 11c10956515b8ec44cf4f2a7b9d8bf8b9dc05ec4 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Thu, 10 Nov 2022 20:26:06 +0800 Subject: 9p/fd: fix issue of list_del corruption in p9_fd_cancel() Syz reported the following issue: kernel BUG at lib/list_debug.c:53! invalid opcode: 0000 [#1] PREEMPT SMP KASAN RIP: 0010:__list_del_entry_valid.cold+0x5c/0x72 Call Trace: p9_fd_cancel+0xb1/0x270 p9_client_rpc+0x8ea/0xba0 p9_client_create+0x9c0/0xed0 v9fs_session_init+0x1e0/0x1620 v9fs_mount+0xba/0xb80 legacy_get_tree+0x103/0x200 vfs_get_tree+0x89/0x2d0 path_mount+0x4c0/0x1ac0 __x64_sys_mount+0x33b/0x430 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 The process is as follows: Thread A: Thread B: p9_poll_workfn() p9_client_create() ... ... p9_conn_cancel() p9_fd_cancel() list_del() ... ... list_del() //list_del corruption There is no lock protection when deleting the list entry in p9_conn_cancel(). After thread A deletes the list entry, thread B will delete the same entry again, causing the list_del corruption. Setting req->status to REQ_STATUS_ERROR under lock prevents other cleanup paths from trying to manipulate req_list. The other thread can safely check req->status because it still holds a reference to req at this point.
Link: https://lkml.kernel.org/r/20221110122606.383352-1-shaozhengchao@huawei.com Fixes: 52f1c45dde91 ("9p: trans_fd/p9_conn_cancel: drop client lock earlier") Reported-by: syzbot+9b69b8d10ab4a7d88056@syzkaller.appspotmail.com Signed-off-by: Zhengchao Shao [Dominique: add description of the fix in commit message] Signed-off-by: Dominique Martinet --- net/9p/trans_fd.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 56a186768750..bd28e63d7666 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -202,9 +202,11 @@ static void p9_conn_cancel(struct p9_conn *m, int err) list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) { list_move(&req->req_list, &cancel_list); + req->status = REQ_STATUS_ERROR; } list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) { list_move(&req->req_list, &cancel_list); + req->status = REQ_STATUS_ERROR; } spin_unlock(&m->req_lock); -- cgit v1.2.3 From 578b565b240afdfe0596d183f473f333eb9d3008 Mon Sep 17 00:00:00 2001 From: GUO Zihua Date: Thu, 17 Nov 2022 17:11:57 +0800 Subject: 9p/fd: Fix write overflow in p9_read_work This error was reported while fuzzing: BUG: KASAN: slab-out-of-bounds in _copy_to_iter+0xd35/0x1190 Write of size 4043 at addr ffff888008724eb1 by task kworker/1:1/24 CPU: 1 PID: 24 Comm: kworker/1:1 Not tainted 6.1.0-rc5-00002-g1adf73218daa-dirty #223 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 Workqueue: events p9_read_work Call Trace: dump_stack_lvl+0x4c/0x64 print_report+0x178/0x4b0 kasan_report+0xae/0x130 kasan_check_range+0x179/0x1e0 memcpy+0x38/0x60 _copy_to_iter+0xd35/0x1190 copy_page_to_iter+0x1d5/0xb00 pipe_read+0x3a1/0xd90 __kernel_read+0x2a5/0x760 kernel_read+0x47/0x60 p9_read_work+0x463/0x780 process_one_work+0x91d/0x1300 worker_thread+0x8c/0x1210 kthread+0x280/0x330 ret_from_fork+0x22/0x30 Allocated by task 457: kasan_save_stack+0x1c/0x40 kasan_set_track+0x21/0x30 __kasan_kmalloc+0x7e/0x90 __kmalloc+0x59/0x140 p9_fcall_init.isra.11+0x5d/0x1c0 p9_tag_alloc+0x251/0x550 p9_client_prepare_req+0x162/0x350 p9_client_rpc+0x18d/0xa90 p9_client_create+0x670/0x14e0 v9fs_session_init+0x1fd/0x14f0 v9fs_mount+0xd7/0xaf0 legacy_get_tree+0xf3/0x1f0 vfs_get_tree+0x86/0x2c0 path_mount+0x885/0x1940 do_mount+0xec/0x100 __x64_sys_mount+0x1a0/0x1e0 do_syscall_64+0x3a/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd This BUG pops up when trying to reproduce https://syzkaller.appspot.com/bug?id=6c7cd46c7bdd0e86f95d26ec3153208ad186f9fa The callstack is different but the issue is valid and re-producable with the same re-producer in the link. The root cause of this issue is that we check the size of the message received against the msize of the client in p9_read_work. However, it turns out that capacity is no longer consistent with msize. Thus, the message size should be checked against sdata capacity. As the msize is non-consistant with the capacity of the tag and as we are now checking message size against capacity directly, there is no point checking message size against msize. So remove it. 
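[Editorial illustration] A hedged C sketch of the rule the patch enforces: bound the size announced by the peer by the buffer actually allocated, not by a negotiated maximum. The structure and field names below are simplified stand-ins, not the 9p types.

#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct rx_buf {
    uint8_t *sdata;     /* receive buffer */
    size_t capacity;    /* bytes really allocated for sdata */
};

/* copy a packet whose size the peer announced in its header */
static int rx_copy(struct rx_buf *rc, const uint8_t *pkt, size_t announced)
{
    if (announced > rc->capacity)   /* check against capacity, not msize */
        return -EIO;
    memcpy(rc->sdata, pkt, announced);
    return 0;
}

int main(void)
{
    uint8_t buf[64];
    struct rx_buf rc = { .sdata = buf, .capacity = sizeof(buf) };
    uint8_t pkt[128] = { 0 };

    /* an oversized announcement is rejected instead of overflowing buf */
    return rx_copy(&rc, pkt, sizeof(pkt)) == -EIO ? 0 : 1;
}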
Link: https://lkml.kernel.org/r/20221117091159.31533-2-guozihua@huawei.com Link: https://lkml.kernel.org/r/20221117091159.31533-3-guozihua@huawei.com Reported-by: syzbot+0f89bd13eaceccc0e126@syzkaller.appspotmail.com Fixes: 60ece0833b6c ("net/9p: allocate appropriate reduced message buffers") Signed-off-by: GUO Zihua Reviewed-by: Christian Schoenebeck [Dominique: squash patches 1 & 2 and fix size including header part] Signed-off-by: Dominique Martinet --- net/9p/trans_fd.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index bd28e63d7666..27c5c938ccd1 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -324,14 +324,6 @@ static void p9_read_work(struct work_struct *work) goto error; } - if (m->rc.size >= m->client->msize) { - p9_debug(P9_DEBUG_ERROR, - "requested packet size too big: %d\n", - m->rc.size); - err = -EIO; - goto error; - } - p9_debug(P9_DEBUG_TRANS, "mux %p pkt: size: %d bytes tag: %d\n", m, m->rc.size, m->rc.tag); @@ -344,6 +336,14 @@ static void p9_read_work(struct work_struct *work) goto error; } + if (m->rc.size > m->rreq->rc.capacity) { + p9_debug(P9_DEBUG_ERROR, + "requested packet size too big: %d for tag %d with capacity %zd\n", + m->rc.size, m->rc.tag, m->rreq->rc.capacity); + err = -EIO; + goto error; + } + if (!m->rreq->rc.sdata) { p9_debug(P9_DEBUG_ERROR, "No recv fcall for tag %d (req %p), disconnecting!\n", -- cgit v1.2.3 From 6854fadbeee10891ed74246bdc05031906b6c8cf Mon Sep 17 00:00:00 2001 From: GUO Zihua Date: Thu, 17 Nov 2022 17:11:59 +0800 Subject: 9p/fd: Use P9_HDRSZ for header size Cleanup hardcoded header sizes to use P9_HDRSZ instead of '7' Link: https://lkml.kernel.org/r/20221117091159.31533-4-guozihua@huawei.com Signed-off-by: GUO Zihua Reviewed-by: Christian Schoenebeck [Dominique: commit message adjusted to make sense after offset size adjustment got removed] Signed-off-by: Dominique Martinet --- net/9p/trans_fd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 27c5c938ccd1..eeea0a6a75b6 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -120,7 +120,7 @@ struct p9_conn { struct list_head unsent_req_list; struct p9_req_t *rreq; struct p9_req_t *wreq; - char tmp_buf[7]; + char tmp_buf[P9_HDRSZ]; struct p9_fcall rc; int wpos; int wsize; @@ -293,7 +293,7 @@ static void p9_read_work(struct work_struct *work) if (!m->rc.sdata) { m->rc.sdata = m->tmp_buf; m->rc.offset = 0; - m->rc.capacity = 7; /* start by reading header */ + m->rc.capacity = P9_HDRSZ; /* start by reading header */ } clear_bit(Rpending, &m->wsched); @@ -316,7 +316,7 @@ static void p9_read_work(struct work_struct *work) p9_debug(P9_DEBUG_TRANS, "got new header\n"); /* Header size */ - m->rc.size = 7; + m->rc.size = P9_HDRSZ; err = p9_parse_header(&m->rc, &m->rc.size, NULL, NULL, 0); if (err) { p9_debug(P9_DEBUG_ERROR, -- cgit v1.2.3 From 52d1aa8b8249ff477aaa38b6f74a8ced780d079c Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Wed, 9 Nov 2022 12:39:07 -0700 Subject: netfilter: conntrack: Fix data-races around ct mark nf_conn:mark can be read from and written to in parallel. Use READ_ONCE()/WRITE_ONCE() for reads and writes to prevent unwanted compiler optimizations. 
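[Editorial illustration] A minimal userspace sketch of the access pattern the patch introduces. The kernel's READ_ONCE()/WRITE_ONCE() are modelled here with C11 relaxed atomics purely for illustration, and the struct below is a stand-in, not the real nf_conn: the point is to read the shared field once, compute on the snapshot, and write it back with a single store.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct conn {
    _Atomic uint32_t mark;      /* stands in for ct->mark */
};

static uint32_t mark_read_once(struct conn *ct)
{
    return atomic_load_explicit(&ct->mark, memory_order_relaxed);
}

static void mark_write_once(struct conn *ct, uint32_t v)
{
    atomic_store_explicit(&ct->mark, v, memory_order_relaxed);
}

/* mirrors the shape of the XT_CONNMARK_SET branch after the change:
 * snapshot the mark, compute the new value, store it once */
static void set_mark(struct conn *ct, uint32_t ctmark, uint32_t ctmask)
{
    uint32_t oldmark = mark_read_once(ct);
    uint32_t newmark = (oldmark & ~ctmask) ^ ctmark;

    if (oldmark != newmark)
        mark_write_once(ct, newmark);
}

int main(void)
{
    struct conn ct = { .mark = 0x12340000 };

    set_mark(&ct, 0x00ff, 0xffff);
    printf("mark=0x%x\n", mark_read_once(&ct));  /* 0x123400ff */
    return 0;
}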
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Daniel Xu Signed-off-by: Pablo Neira Ayuso --- net/core/flow_dissector.c | 2 +- net/ipv4/netfilter/ipt_CLUSTERIP.c | 4 ++-- net/netfilter/nf_conntrack_core.c | 2 +- net/netfilter/nf_conntrack_netlink.c | 24 ++++++++++++++---------- net/netfilter/nf_conntrack_standalone.c | 2 +- net/netfilter/nft_ct.c | 6 +++--- net/netfilter/xt_connmark.c | 18 ++++++++++-------- net/openvswitch/conntrack.c | 8 ++++---- net/sched/act_connmark.c | 4 ++-- net/sched/act_ct.c | 8 ++++---- net/sched/act_ctinfo.c | 6 +++--- 11 files changed, 45 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 25cd35f5922e..007730412947 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -296,7 +296,7 @@ skb_flow_dissect_ct(const struct sk_buff *skb, key->ct_zone = ct->zone.id; #endif #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) - key->ct_mark = ct->mark; + key->ct_mark = READ_ONCE(ct->mark); #endif cl = nf_ct_labels_find(ct); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index f8e176c77d1c..b3cc416ed292 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -435,7 +435,7 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) switch (ctinfo) { case IP_CT_NEW: - ct->mark = hash; + WRITE_ONCE(ct->mark, hash); break; case IP_CT_RELATED: case IP_CT_RELATED_REPLY: @@ -452,7 +452,7 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) #ifdef DEBUG nf_ct_dump_tuple_ip(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); #endif - pr_debug("hash=%u ct_hash=%u ", hash, ct->mark); + pr_debug("hash=%u ct_hash=%u ", hash, READ_ONCE(ct->mark)); if (!clusterip_responsible(cipinfo->config, hash)) { pr_debug("not responsible\n"); return NF_DROP; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f97bda06d2a9..2692139ce417 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1781,7 +1781,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, } #ifdef CONFIG_NF_CONNTRACK_MARK - ct->mark = exp->master->mark; + ct->mark = READ_ONCE(exp->master->mark); #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK ct->secmark = exp->master->secmark; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 7562b215b932..d71150a40fb0 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -328,9 +328,9 @@ nla_put_failure: } #ifdef CONFIG_NF_CONNTRACK_MARK -static int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct) +static int ctnetlink_dump_mark(struct sk_buff *skb, u32 mark) { - if (nla_put_be32(skb, CTA_MARK, htonl(ct->mark))) + if (nla_put_be32(skb, CTA_MARK, htonl(mark))) goto nla_put_failure; return 0; @@ -543,7 +543,7 @@ static int ctnetlink_dump_extinfo(struct sk_buff *skb, static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct) { if (ctnetlink_dump_status(skb, ct) < 0 || - ctnetlink_dump_mark(skb, ct) < 0 || + ctnetlink_dump_mark(skb, READ_ONCE(ct->mark)) < 0 || ctnetlink_dump_secctx(skb, ct) < 0 || ctnetlink_dump_id(skb, ct) < 0 || ctnetlink_dump_use(skb, ct) < 0 || @@ -722,6 +722,7 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item) struct sk_buff *skb; unsigned int type; unsigned int flags = 0, group; + u32 mark; int err; if (events & (1 << IPCT_DESTROY)) { @@ -826,8 +827,9 @@ 
ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item) } #ifdef CONFIG_NF_CONNTRACK_MARK - if ((events & (1 << IPCT_MARK) || ct->mark) - && ctnetlink_dump_mark(skb, ct) < 0) + mark = READ_ONCE(ct->mark); + if ((events & (1 << IPCT_MARK) || mark) && + ctnetlink_dump_mark(skb, mark) < 0) goto nla_put_failure; #endif nlmsg_end(skb, nlh); @@ -1154,7 +1156,7 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) } #ifdef CONFIG_NF_CONNTRACK_MARK - if ((ct->mark & filter->mark.mask) != filter->mark.val) + if ((READ_ONCE(ct->mark) & filter->mark.mask) != filter->mark.val) goto ignore_entry; #endif status = (u32)READ_ONCE(ct->status); @@ -2002,9 +2004,9 @@ static void ctnetlink_change_mark(struct nf_conn *ct, mask = ~ntohl(nla_get_be32(cda[CTA_MARK_MASK])); mark = ntohl(nla_get_be32(cda[CTA_MARK])); - newmark = (ct->mark & mask) ^ mark; - if (newmark != ct->mark) - ct->mark = newmark; + newmark = (READ_ONCE(ct->mark) & mask) ^ mark; + if (newmark != READ_ONCE(ct->mark)) + WRITE_ONCE(ct->mark, newmark); } #endif @@ -2669,6 +2671,7 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) { const struct nf_conntrack_zone *zone; struct nlattr *nest_parms; + u32 mark; zone = nf_ct_zone(ct); @@ -2730,7 +2733,8 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) goto nla_put_failure; #ifdef CONFIG_NF_CONNTRACK_MARK - if (ct->mark && ctnetlink_dump_mark(skb, ct) < 0) + mark = READ_ONCE(ct->mark); + if (mark && ctnetlink_dump_mark(skb, mark) < 0) goto nla_put_failure; #endif if (ctnetlink_dump_labels(skb, ct) < 0) diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 4ffe84c5a82c..bca839ab1ae8 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -366,7 +366,7 @@ static int ct_seq_show(struct seq_file *s, void *v) goto release; #if defined(CONFIG_NF_CONNTRACK_MARK) - seq_printf(s, "mark=%u ", ct->mark); + seq_printf(s, "mark=%u ", READ_ONCE(ct->mark)); #endif ct_show_secctx(s, ct); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index a3f01f209a53..641dc21f92b4 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -98,7 +98,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr, return; #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: - *dest = ct->mark; + *dest = READ_ONCE(ct->mark); return; #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK @@ -297,8 +297,8 @@ static void nft_ct_set_eval(const struct nft_expr *expr, switch (priv->key) { #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: - if (ct->mark != value) { - ct->mark = value; + if (READ_ONCE(ct->mark) != value) { + WRITE_ONCE(ct->mark, value); nf_conntrack_event_cache(IPCT_MARK, ct); } break; diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index e5ebc0810675..ad3c033db64e 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -30,6 +30,7 @@ connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info) u_int32_t new_targetmark; struct nf_conn *ct; u_int32_t newmark; + u_int32_t oldmark; ct = nf_ct_get(skb, &ctinfo); if (ct == NULL) @@ -37,14 +38,15 @@ connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info) switch (info->mode) { case XT_CONNMARK_SET: - newmark = (ct->mark & ~info->ctmask) ^ info->ctmark; + oldmark = READ_ONCE(ct->mark); + newmark = (oldmark & ~info->ctmask) ^ info->ctmark; if (info->shift_dir == D_SHIFT_RIGHT) newmark >>= info->shift_bits; else 
newmark <<= info->shift_bits; - if (ct->mark != newmark) { - ct->mark = newmark; + if (READ_ONCE(ct->mark) != newmark) { + WRITE_ONCE(ct->mark, newmark); nf_conntrack_event_cache(IPCT_MARK, ct); } break; @@ -55,15 +57,15 @@ connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info) else new_targetmark <<= info->shift_bits; - newmark = (ct->mark & ~info->ctmask) ^ + newmark = (READ_ONCE(ct->mark) & ~info->ctmask) ^ new_targetmark; - if (ct->mark != newmark) { - ct->mark = newmark; + if (READ_ONCE(ct->mark) != newmark) { + WRITE_ONCE(ct->mark, newmark); nf_conntrack_event_cache(IPCT_MARK, ct); } break; case XT_CONNMARK_RESTORE: - new_targetmark = (ct->mark & info->ctmask); + new_targetmark = (READ_ONCE(ct->mark) & info->ctmask); if (info->shift_dir == D_SHIFT_RIGHT) new_targetmark >>= info->shift_bits; else @@ -126,7 +128,7 @@ connmark_mt(const struct sk_buff *skb, struct xt_action_param *par) if (ct == NULL) return false; - return ((ct->mark & info->mask) == info->mark) ^ info->invert; + return ((READ_ONCE(ct->mark) & info->mask) == info->mark) ^ info->invert; } static int connmark_mt_check(const struct xt_mtchk_param *par) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index c7b10234cf7c..c8eaf4234b2e 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -152,7 +152,7 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo) static u32 ovs_ct_get_mark(const struct nf_conn *ct) { #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) - return ct ? ct->mark : 0; + return ct ? READ_ONCE(ct->mark) : 0; #else return 0; #endif @@ -340,9 +340,9 @@ static int ovs_ct_set_mark(struct nf_conn *ct, struct sw_flow_key *key, #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) u32 new_mark; - new_mark = ct_mark | (ct->mark & ~(mask)); - if (ct->mark != new_mark) { - ct->mark = new_mark; + new_mark = ct_mark | (READ_ONCE(ct->mark) & ~(mask)); + if (READ_ONCE(ct->mark) != new_mark) { + WRITE_ONCE(ct->mark, new_mark); if (nf_ct_is_confirmed(ct)) nf_conntrack_event_cache(IPCT_MARK, ct); key->ct.mark = new_mark; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 66b143bb04ac..d41002e4613f 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -61,7 +61,7 @@ static int tcf_connmark_act(struct sk_buff *skb, const struct tc_action *a, c = nf_ct_get(skb, &ctinfo); if (c) { - skb->mark = c->mark; + skb->mark = READ_ONCE(c->mark); /* using overlimits stats to count how many packets marked */ ca->tcf_qstats.overlimits++; goto out; @@ -81,7 +81,7 @@ static int tcf_connmark_act(struct sk_buff *skb, const struct tc_action *a, c = nf_ct_tuplehash_to_ctrack(thash); /* using overlimits stats to count how many packets marked */ ca->tcf_qstats.overlimits++; - skb->mark = c->mark; + skb->mark = READ_ONCE(c->mark); nf_ct_put(c); out: diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index b38d91d6b249..4c7f7861ea96 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -178,7 +178,7 @@ static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct, entry = tcf_ct_flow_table_flow_action_get_next(action); entry->id = FLOW_ACTION_CT_METADATA; #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) - entry->ct_metadata.mark = ct->mark; + entry->ct_metadata.mark = READ_ONCE(ct->mark); #endif ctinfo = dir == IP_CT_DIR_ORIGINAL ? 
IP_CT_ESTABLISHED : IP_CT_ESTABLISHED_REPLY; @@ -936,9 +936,9 @@ static void tcf_ct_act_set_mark(struct nf_conn *ct, u32 mark, u32 mask) if (!mask) return; - new_mark = mark | (ct->mark & ~(mask)); - if (ct->mark != new_mark) { - ct->mark = new_mark; + new_mark = mark | (READ_ONCE(ct->mark) & ~(mask)); + if (READ_ONCE(ct->mark) != new_mark) { + WRITE_ONCE(ct->mark, new_mark); if (nf_ct_is_confirmed(ct)) nf_conntrack_event_cache(IPCT_MARK, ct); } diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index d4102f0a9abd..eaa02f098d1c 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -32,7 +32,7 @@ static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca, { u8 dscp, newdscp; - newdscp = (((ct->mark & cp->dscpmask) >> cp->dscpmaskshift) << 2) & + newdscp = (((READ_ONCE(ct->mark) & cp->dscpmask) >> cp->dscpmaskshift) << 2) & ~INET_ECN_MASK; switch (proto) { @@ -72,7 +72,7 @@ static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo *ca, struct sk_buff *skb) { ca->stats_cpmark_set++; - skb->mark = ct->mark & cp->cpmarkmask; + skb->mark = READ_ONCE(ct->mark) & cp->cpmarkmask; } static int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a, @@ -130,7 +130,7 @@ static int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a, } if (cp->mode & CTINFO_MODE_DSCP) - if (!cp->dscpstatemask || (ct->mark & cp->dscpstatemask)) + if (!cp->dscpstatemask || (READ_ONCE(ct->mark) & cp->dscpstatemask)) tcf_ctinfo_dscp_set(ct, ca, cp, skb, wlen, proto); if (cp->mode & CTINFO_MODE_CPMARK) -- cgit v1.2.3 From 33c7aba0b4ffd6d7cdab862a034eb582a5120a38 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 14 Nov 2022 11:31:54 +0100 Subject: netfilter: nf_tables: do not set up extensions for end interval Elements with an end interval flag set on do not store extensions. The global set definition is currently setting on the timeout and stateful expression for end interval elements. This leads to skipping end interval elements from the set->ops->walk() path as the expired check bogusly reports true. Moreover, do not set up stateful expressions for elements with end interval flag set on since this is never used. 
Fixes: 65038428b2c6 ("netfilter: nf_tables: allow to specify stateful expression in set definition") Fixes: 8d8540c4f5e0 ("netfilter: nft_set_rbtree: add timeout support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index e7152d599d73..7a09421f19e1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5958,7 +5958,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, &timeout); if (err) return err; - } else if (set->flags & NFT_SET_TIMEOUT) { + } else if (set->flags & NFT_SET_TIMEOUT && + !(flags & NFT_SET_ELEM_INTERVAL_END)) { timeout = set->timeout; } @@ -6024,7 +6025,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = -EOPNOTSUPP; goto err_set_elem_expr; } - } else if (set->num_exprs > 0) { + } else if (set->num_exprs > 0 && + !(flags & NFT_SET_ELEM_INTERVAL_END)) { err = nft_set_elem_expr_clone(ctx, set, expr_array); if (err < 0) goto err_set_elem_expr_clone; -- cgit v1.2.3 From 764f8485890d4988a2083f5dea90cfe0116b25f6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Nov 2022 20:21:52 -0800 Subject: ipv4/fib: Replace zero-length array with DECLARE_FLEX_ARRAY() helper Zero-length arrays are deprecated[1] and are being replaced with flexible array members in support of the ongoing efforts to tighten the FORTIFY_SOURCE routines on memcpy(), correctly instrument array indexing with UBSAN_BOUNDS, and to globally enable -fstrict-flex-arrays=3. Replace zero-length array with flexible-array member in struct key_vector. This results in no differences in binary output. [1] https://github.com/KSPP/linux/issues/78 Cc: Jakub Kicinski Cc: "David S. Miller" Cc: Hideaki YOSHIFUJI Cc: David Ahern Cc: Eric Dumazet Cc: Paolo Abeni Cc: "Gustavo A. R. Silva" Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Gustavo A. R. Silva Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 452ff177e4da..c88bf856c443 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -126,7 +126,7 @@ struct key_vector { /* This list pointer if valid if (pos | bits) == 0 (LEAF) */ struct hlist_head leaf; /* This array is valid if (pos | bits) > 0 (TNODE) */ - struct key_vector __rcu *tnode[0]; + DECLARE_FLEX_ARRAY(struct key_vector __rcu *, tnode); }; }; -- cgit v1.2.3 From c7aa1a76d4a0a3c401025b60c401412bbb60f8c6 Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Wed, 28 Sep 2022 14:26:50 -0400 Subject: netfilter: ipset: regression in ip_set_hash_ip.c This patch introduced a regression: commit 48596a8ddc46 ("netfilter: ipset: Fix adding an IPv4 range containing more than 2^31 addresses") The variable e.ip is passed to adtfn() function which finally adds the ip address to the set. The patch above refactored the for loop and moved e.ip = htonl(ip) to the end of the for loop. What this means is that if the value of "ip" changes between the first assignement of e.ip and the forloop, then e.ip is pointing to a different ip address than "ip". 
Test case: $ ipset create jdtest_tmp hash:ip family inet hashsize 2048 maxelem 100000 $ ipset add jdtest_tmp 10.0.1.1/31 ipset v6.21.1: Element cannot be added to the set: it's already added The value of ip gets updated inside the "else if (tb[IPSET_ATTR_CIDR])" block but e.ip is still pointing to the old value. Fixes: 48596a8ddc46 ("netfilter: ipset: Fix adding an IPv4 range containing more than 2^31 addresses") Reviewed-by: Joshua Hunt Signed-off-by: Vishwanath Pai Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_ip.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index dd30c03d5a23..75d556d71652 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -151,18 +151,16 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], if (((u64)ip_to - ip + 1) >> (32 - h->netmask) > IPSET_MAX_RANGE) return -ERANGE; - if (retried) { + if (retried) ip = ntohl(h->next.ip); - e.ip = htonl(ip); - } for (; ip <= ip_to;) { + e.ip = htonl(ip); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ip += hosts; - e.ip = htonl(ip); - if (e.ip == 0) + if (ip == 0) return 0; ret = 0; -- cgit v1.2.3 From 0e5d56c64afcd6fd2d132ea972605b66f8a7d3c4 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 18 Nov 2022 16:45:00 -0500 Subject: tipc: set con sock in tipc_conn_alloc A crash was reported by Wei Chen: BUG: kernel NULL pointer dereference, address: 0000000000000018 RIP: 0010:tipc_conn_close+0x12/0x100 Call Trace: tipc_topsrv_exit_net+0x139/0x320 ops_exit_list.isra.9+0x49/0x80 cleanup_net+0x31a/0x540 process_one_work+0x3fa/0x9f0 worker_thread+0x42/0x5c0 It was caused by !con->sock in tipc_conn_close(). In tipc_topsrv_accept(), con is allocated in conn_idr then its sock is set: con = tipc_conn_alloc(); ... <----[1] con->sock = newsock; If tipc_conn_close() is called in anytime of [1], the null-pointer-def is triggered by con->sock->sk due to con->sock is not yet set. This patch fixes it by moving the con->sock setting to tipc_conn_alloc() under s->idr_lock. So that con->sock can never be NULL when getting the con from s->conn_idr. It will be also safer to move con->server and flag CF_CONNECTED setting under s->idr_lock, as they should all be set before tipc_conn_alloc() is called. 
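[Editorial illustration] A hedged userspace sketch of the idea behind this fix; the table, lock and field names are simplified stand-ins, not the TIPC structures. The object is fully initialised, including its socket pointer, before it becomes findable in the shared table, and publication happens under the same lock the close/lookup path takes, so a looked-up connection can never have a NULL socket.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct conn {
    int conid;
    void *sock;                  /* must never be NULL once visible */
};

#define MAX_CONN 16
static struct conn *conn_table[MAX_CONN];        /* stands in for conn_idr */
static pthread_mutex_t idr_lock = PTHREAD_MUTEX_INITIALIZER;

static struct conn *conn_alloc(void *sock)
{
    struct conn *con = calloc(1, sizeof(*con));

    if (!con)
        return NULL;

    pthread_mutex_lock(&idr_lock);
    for (int i = 0; i < MAX_CONN; i++) {
        if (!conn_table[i]) {
            con->conid = i;
            con->sock = sock;    /* set before publication, under the lock */
            conn_table[i] = con; /* only now is it visible to lookups */
            pthread_mutex_unlock(&idr_lock);
            return con;
        }
    }
    pthread_mutex_unlock(&idr_lock);
    free(con);
    return NULL;
}

/* close path: whatever it finds in the table already has a valid sock */
static void conn_close(int conid)
{
    pthread_mutex_lock(&idr_lock);
    struct conn *con = conn_table[conid];
    if (con)
        printf("closing conid %d, sock=%p\n", con->conid, con->sock);
    pthread_mutex_unlock(&idr_lock);
}

int main(void)
{
    int dummy_sock;
    struct conn *con = conn_alloc(&dummy_sock);

    if (con)
        conn_close(con->conid);
    free(con);
    return 0;
}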
Fixes: c5fa7b3cf3cb ("tipc: introduce new TIPC server infrastructure") Reported-by: Wei Chen Signed-off-by: Xin Long Acked-by: Jon Maloy Signed-off-by: Jakub Kicinski --- net/tipc/topsrv.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index d92ec92f0b71..b0f9aa521670 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -176,7 +176,7 @@ static void tipc_conn_close(struct tipc_conn *con) conn_put(con); } -static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s) +static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s, struct socket *sock) { struct tipc_conn *con; int ret; @@ -202,10 +202,11 @@ static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s) } con->conid = ret; s->idr_in_use++; - spin_unlock_bh(&s->idr_lock); set_bit(CF_CONNECTED, &con->flags); con->server = s; + con->sock = sock; + spin_unlock_bh(&s->idr_lock); return con; } @@ -467,7 +468,7 @@ static void tipc_topsrv_accept(struct work_struct *work) ret = kernel_accept(lsock, &newsock, O_NONBLOCK); if (ret < 0) return; - con = tipc_conn_alloc(srv); + con = tipc_conn_alloc(srv, newsock); if (IS_ERR(con)) { ret = PTR_ERR(con); sock_release(newsock); @@ -479,7 +480,6 @@ static void tipc_topsrv_accept(struct work_struct *work) newsk->sk_data_ready = tipc_conn_data_ready; newsk->sk_write_space = tipc_conn_write_space; newsk->sk_user_data = con; - con->sock = newsock; write_unlock_bh(&newsk->sk_callback_lock); /* Wake up receive process in case of 'SYN+' message */ @@ -577,12 +577,11 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, sub.filter = filter; *(u64 *)&sub.usr_handle = (u64)port; - con = tipc_conn_alloc(tipc_topsrv(net)); + con = tipc_conn_alloc(tipc_topsrv(net), NULL); if (IS_ERR(con)) return false; *conid = con->conid; - con->sock = NULL; rc = tipc_conn_rcv_sub(tipc_topsrv(net), con, &sub); if (rc >= 0) return true; -- cgit v1.2.3 From a7b42969d63f47320853a802efd879fbdc4e010e Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 18 Nov 2022 16:45:01 -0500 Subject: tipc: add an extra conn_get in tipc_conn_alloc One extra conn_get() is needed in tipc_conn_alloc(), as after tipc_conn_alloc() is called, tipc_conn_close() may free this con before deferencing it in tipc_topsrv_accept(): tipc_conn_alloc(); newsk = newsock->sk; <---- tipc_conn_close(); write_lock_bh(&sk->sk_callback_lock); newsk->sk_data_ready = tipc_conn_data_ready; Then an uaf issue can be triggered: BUG: KASAN: use-after-free in tipc_topsrv_accept+0x1e7/0x370 [tipc] Call Trace: dump_stack_lvl+0x33/0x46 print_report+0x178/0x4b0 kasan_report+0x8c/0x100 kasan_check_range+0x179/0x1e0 tipc_topsrv_accept+0x1e7/0x370 [tipc] process_one_work+0x6a3/0x1030 worker_thread+0x8a/0xdf0 This patch fixes it by holding it in tipc_conn_alloc(), then after all accessing in tipc_topsrv_accept() releasing it. Note when does this in tipc_topsrv_kern_subscr(), as tipc_conn_rcv_sub() returns 0 or -1 only, we don't need to check for "> 0". 
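[Editorial illustration] A minimal sketch of the refcounting rule used by this fix, in userspace C with simplified names (not the TIPC code): the allocation path hands the object back with an extra reference that the caller drops only after it has finished touching the object, so a concurrent close dropping its own reference in the meantime cannot trigger the final free underneath the caller.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct conn {
    _Atomic int ref;
    int data;
};

static struct conn *conn_alloc(void)
{
    struct conn *con = calloc(1, sizeof(*con));

    if (!con)
        return NULL;
    /* one ref owned by the table, one extra for the caller */
    atomic_store(&con->ref, 2);
    return con;
}

static void conn_put(struct conn *con)
{
    if (atomic_fetch_sub(&con->ref, 1) == 1) {  /* dropped the last ref */
        printf("freeing conn\n");
        free(con);
    }
}

int main(void)
{
    struct conn *con = conn_alloc();

    if (!con)
        return 1;

    conn_put(con);      /* e.g. a concurrent close dropping the table's ref */
    con->data = 42;     /* still safe: the caller's extra ref keeps it alive */
    conn_put(con);      /* caller is done: last ref, object freed here */
    return 0;
}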
Fixes: c5fa7b3cf3cb ("tipc: introduce new TIPC server infrastructure") Signed-off-by: Xin Long Acked-by: Jon Maloy Signed-off-by: Jakub Kicinski --- net/tipc/topsrv.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index b0f9aa521670..e3b427a70398 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -206,6 +206,7 @@ static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s, struct socket *s set_bit(CF_CONNECTED, &con->flags); con->server = s; con->sock = sock; + conn_get(con); spin_unlock_bh(&s->idr_lock); return con; @@ -484,6 +485,7 @@ static void tipc_topsrv_accept(struct work_struct *work) /* Wake up receive process in case of 'SYN+' message */ newsk->sk_data_ready(newsk); + conn_put(con); } } @@ -583,10 +585,11 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, *conid = con->conid; rc = tipc_conn_rcv_sub(tipc_topsrv(net), con, &sub); - if (rc >= 0) - return true; + if (rc) + conn_put(con); + conn_put(con); - return false; + return !rc; } void tipc_topsrv_kern_unsubscr(struct net *net, int conid) -- cgit v1.2.3 From cd0f6421162201e4b22ce757a1966729323185eb Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 19 Nov 2022 15:28:32 +0800 Subject: tipc: check skb_linearize() return value in tipc_disc_rcv() If skb_linearize() fails in tipc_disc_rcv(), we need to free the skb instead of handle it. Fixes: 25b0b9c4e835 ("tipc: handle collisions of 32-bit node address hash values") Signed-off-by: YueHaibing Acked-by: Jon Maloy Link: https://lore.kernel.org/r/20221119072832.7896-1-yuehaibing@huawei.com Signed-off-by: Jakub Kicinski --- net/tipc/discover.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/discover.c b/net/tipc/discover.c index e8630707901e..e8dcdf267c0c 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -211,7 +211,10 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, u32 self; int err; - skb_linearize(skb); + if (skb_linearize(skb)) { + kfree_skb(skb); + return; + } hdr = buf_msg(skb); if (caps & TIPC_NODE_ID128) -- cgit v1.2.3 From b97df039a68b2f3e848e238df5d5d06343ea497b Mon Sep 17 00:00:00 2001 From: Thomas Jarosch Date: Wed, 2 Nov 2022 11:18:48 +0100 Subject: xfrm: Fix oops in __xfrm_state_delete() Kernel 5.14 added a new "byseq" index to speed up xfrm_state lookups by sequence number in commit fe9f1d8779cb ("xfrm: add state hashtable keyed by seq") While the patch was thorough, the function pfkey_send_new_mapping() in net/af_key.c also modifies x->km.seq and never added the current xfrm_state to the "byseq" index. This leads to the following kernel Ooops: BUG: kernel NULL pointer dereference, address: 0000000000000000 .. RIP: 0010:__xfrm_state_delete+0xc9/0x1c0 .. Call Trace: xfrm_state_delete+0x1e/0x40 xfrm_del_sa+0xb0/0x110 [xfrm_user] xfrm_user_rcv_msg+0x12d/0x270 [xfrm_user] ? remove_entity_load_avg+0x8a/0xa0 ? copy_to_user_state_extra+0x580/0x580 [xfrm_user] netlink_rcv_skb+0x51/0x100 xfrm_netlink_rcv+0x30/0x50 [xfrm_user] netlink_unicast+0x1a6/0x270 netlink_sendmsg+0x22a/0x480 __sys_sendto+0x1a6/0x1c0 ? __audit_syscall_entry+0xd8/0x130 ? __audit_syscall_exit+0x249/0x2b0 __x64_sys_sendto+0x23/0x30 do_syscall_64+0x3a/0x90 entry_SYSCALL_64_after_hwframe+0x61/0xcb Exact location of the crash in __xfrm_state_delete(): if (x->km.seq) hlist_del_rcu(&x->byseq); The hlist_node "byseq" was never populated. 
The bug only triggers if a new NAT traversal mapping (changed IP or port) is detected in esp_input_done2() / esp6_input_done2(), which in turn indirectly calls pfkey_send_new_mapping() *if* the kernel is compiled with CONFIG_NET_KEY and "af_key" is active. The PF_KEYv2 message SADB_X_NAT_T_NEW_MAPPING is not part of RFC 2367. Various implementations have been examined how they handle the "sadb_msg_seq" header field: - racoon (Android): does not process SADB_X_NAT_T_NEW_MAPPING - strongswan: does not care about sadb_msg_seq - openswan: does not care about sadb_msg_seq There is no standard how PF_KEYv2 sadb_msg_seq should be populated for SADB_X_NAT_T_NEW_MAPPING and it's not used in popular implementations either. Herbert Xu suggested we should just use the current km.seq value as is. This fixes the root cause of the oops since we no longer modify km.seq itself. The update of "km.seq" looks like a copy'n'paste error from pfkey_send_acquire(). SADB_ACQUIRE must indeed assign a unique km.seq number according to RFC 2367. It has been verified that code paths involving pfkey_send_acquire() don't cause the same Oops. PF_KEYv2 SADB_X_NAT_T_NEW_MAPPING support was originally added here: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git commit cbc3488685b20e7b2a98ad387a1a816aada569d8 Author: Derek Atkins AuthorDate: Wed Apr 2 13:21:02 2003 -0800 [IPSEC]: Implement UDP Encapsulation framework. In particular, implement ESPinUDP encapsulation for IPsec Nat Traversal. A note on triggering the bug: I was not able to trigger it using VMs. There is one VPN using a high latency link on our production VPN server that triggered it like once a day though. Link: https://github.com/strongswan/strongswan/issues/992 Link: https://lore.kernel.org/netdev/00959f33ee52c4b3b0084d42c430418e502db554.1652340703.git.antony.antony@secunet.com/T/ Link: https://lore.kernel.org/netdev/20221027142455.3975224-1-chenzhihao@meizu.com/T/ Fixes: fe9f1d8779cb ("xfrm: add state hashtable keyed by seq") Reported-by: Roth Mark Reported-by: Zhihao Chen Tested-by: Roth Mark Signed-off-by: Thomas Jarosch Acked-by: Antony Antony Acked-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/key/af_key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/key/af_key.c b/net/key/af_key.c index 213287814328..95edcbedf6ef 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3394,7 +3394,7 @@ static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, hdr->sadb_msg_len = size / sizeof(uint64_t); hdr->sadb_msg_errno = 0; hdr->sadb_msg_reserved = 0; - hdr->sadb_msg_seq = x->km.seq = get_acqseq(); + hdr->sadb_msg_seq = x->km.seq; hdr->sadb_msg_pid = 0; /* SA */ -- cgit v1.2.3 From 40781bfb836eda57d19c0baa37c7e72590e05fdc Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 3 Nov 2022 17:07:13 +0800 Subject: xfrm: Fix ignored return value in xfrm6_init() When IPv6 module initializing in xfrm6_init(), register_pernet_subsys() is possible to fail but its return value is ignored. If IPv6 initialization fails later and xfrm6_fini() is called, removing uninitialized list in xfrm6_net_ops will cause null-ptr-deref: KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f] CPU: 1 PID: 330 Comm: insmod RIP: 0010:unregister_pernet_operations+0xc9/0x450 Call Trace: unregister_pernet_subsys+0x31/0x3e xfrm6_fini+0x16/0x30 [ipv6] ip6_route_init+0xcd/0x128 [ipv6] inet6_init+0x29c/0x602 [ipv6] ... Fix it by catching the error return value of register_pernet_subsys(). 
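[Editorial illustration] A hedged sketch of the general init/unwind shape being restored here. The step names below are hypothetical stand-ins, not the real xfrm6 registration functions: every registration step's return value is checked, and a failure unwinds exactly the steps that already succeeded, in reverse order.

#include <stdio.h>

/* hypothetical registration steps; each returns 0 or a negative errno */
static int register_policy(void)   { return 0; }
static int register_state(void)    { return 0; }
static int register_pernet(void)   { return -12; /* simulate -ENOMEM */ }

static void unregister_state(void)  { puts("state undone"); }
static void unregister_policy(void) { puts("policy undone"); }

static int subsys_init(void)
{
    int ret;

    ret = register_policy();
    if (ret)
        goto out;

    ret = register_state();
    if (ret)
        goto out_policy;

    ret = register_pernet();       /* the previously ignored return value */
    if (ret)
        goto out_state;

    return 0;

out_state:
    unregister_state();
out_policy:
    unregister_policy();
out:
    return ret;
}

int main(void)
{
    printf("init: %d\n", subsys_init());
    return 0;
}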
Fixes: 8d068875caca ("xfrm: make gc_thresh configurable in all namespaces") Signed-off-by: Chen Zhongjin Reviewed-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- net/ipv6/xfrm6_policy.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 4a4b0e49ec92..ea435eba3053 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -287,9 +287,13 @@ int __init xfrm6_init(void) if (ret) goto out_state; - register_pernet_subsys(&xfrm6_net_ops); + ret = register_pernet_subsys(&xfrm6_net_ops); + if (ret) + goto out_protocol; out: return ret; +out_protocol: + xfrm6_protocol_fini(); out_state: xfrm6_state_fini(); out_policy: -- cgit v1.2.3 From 8427fd100c7b7793650e212a81e42f1cf124613d Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 18 Nov 2022 16:33:03 -0500 Subject: net: sched: allow act_ct to be built without NF_NAT In commit f11fe1dae1c4 ("net/sched: Make NET_ACT_CT depends on NF_NAT"), it fixed the build failure when NF_NAT is m and NET_ACT_CT is y by adding depends on NF_NAT for NET_ACT_CT. However, it would also cause NET_ACT_CT cannot be built without NF_NAT, which is not expected. This patch fixes it by changing to use "(!NF_NAT || NF_NAT)" as the depend. Fixes: f11fe1dae1c4 ("net/sched: Make NET_ACT_CT depends on NF_NAT") Signed-off-by: Xin Long Link: https://lore.kernel.org/r/b6386f28d1ba34721795fb776a91cbdabb203447.1668807183.git.lucien.xin@gmail.com Signed-off-by: Paolo Abeni --- net/sched/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 1e8ab4749c6c..4662a6ce8a7e 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -976,7 +976,7 @@ config NET_ACT_TUNNEL_KEY config NET_ACT_CT tristate "connection tracking tc action" - depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT && NF_FLOW_TABLE + depends on NET_CLS_ACT && NF_CONNTRACK && (!NF_NAT || NF_NAT) && NF_FLOW_TABLE help Say Y here to allow sending the packets to conntrack module. -- cgit v1.2.3 From 53270fb0fd77fe786d8c07a0793981d797836b93 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 18 Nov 2022 16:24:19 +0800 Subject: NFC: nci: fix memory leak in nci_rx_data_packet() Syzbot reported a memory leak about skb: unreferenced object 0xffff88810e144e00 (size 240): comm "syz-executor284", pid 3701, jiffies 4294952403 (age 12.620s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] __alloc_skb+0x1f9/0x270 net/core/skbuff.c:497 [] alloc_skb include/linux/skbuff.h:1267 [inline] [] virtual_ncidev_write+0x24/0xe0 drivers/nfc/virtual_ncidev.c:116 [] do_loop_readv_writev fs/read_write.c:759 [inline] [] do_loop_readv_writev fs/read_write.c:743 [inline] [] do_iter_write+0x253/0x300 fs/read_write.c:863 [] vfs_writev+0xdd/0x240 fs/read_write.c:934 [] do_writev+0xa6/0x1c0 fs/read_write.c:977 [] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 [] entry_SYSCALL_64_after_hwframe+0x63/0xcd In nci_rx_data_packet(), if we don't get a valid conn_info, we will return directly but forget to release the skb. 
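[Editorial illustration] A small hedged sketch of the ownership rule behind the fix, using plain malloc/free in place of skbs and made-up names: a receive handler that takes ownership of a buffer must release it on every exit path, including the early "no matching context" return.

#include <stdlib.h>
#include <string.h>

struct ctx { int id; };

static struct ctx *lookup_ctx(int id)
{
    (void)id;
    return NULL;        /* pretend no matching connection was found */
}

/* the handler owns 'buf' once called, so every return must free it */
static void rx_packet(unsigned char *buf, size_t len, int conn_id)
{
    struct ctx *ctx = lookup_ctx(conn_id);

    if (!ctx) {
        free(buf);      /* the equivalent of the missing kfree_skb() */
        return;
    }

    /* ... normal processing would consume the data ... */
    (void)len;
    free(buf);
}

int main(void)
{
    unsigned char *buf = malloc(64);

    if (!buf)
        return 1;
    memset(buf, 0, 64);
    rx_packet(buf, 64, 5);      /* no leak even on the error path */
    return 0;
}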
Reported-by: syzbot+cdb9a427d1bc08815104@syzkaller.appspotmail.com Fixes: 4aeee6871e8c ("NFC: nci: Add dynamic logical connections support") Signed-off-by: Liu Shixin Link: https://lore.kernel.org/r/20221118082419.239475-1-liushixin2@huawei.com Signed-off-by: Paolo Abeni --- net/nfc/nci/data.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/nfc/nci/data.c b/net/nfc/nci/data.c index aa5e712adf07..3d36ea5701f0 100644 --- a/net/nfc/nci/data.c +++ b/net/nfc/nci/data.c @@ -279,8 +279,10 @@ void nci_rx_data_packet(struct nci_dev *ndev, struct sk_buff *skb) nci_plen(skb->data)); conn_info = nci_get_conn_info_by_conn_id(ndev, nci_conn_id(skb->data)); - if (!conn_info) + if (!conn_info) { + kfree_skb(skb); return; + } /* strip the nci data header */ skb_pull(skb, NCI_DATA_HDR_SIZE); -- cgit v1.2.3 From 6a66ce44a51bdfc47721f0c591137df2d4b21247 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Tue, 22 Nov 2022 20:18:58 +0100 Subject: netfilter: ipset: restore allowing 64 clashing elements in hash:net,iface The commit 510841da1fcc ("netfilter: ipset: enforce documented limit to prevent allocating huge memory") was too strict and prevented to add up to 64 clashing elements to a hash:net,iface type of set. This patch fixes the issue and now the type behaves as documented. Fixes: 510841da1fcc ("netfilter: ipset: enforce documented limit to prevent allocating huge memory") Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_gen.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 3adc291d9ce1..7499192af586 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -916,7 +916,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, #ifdef IP_SET_HASH_WITH_MULTI if (h->bucketsize >= AHASH_MAX_TUNED) goto set_full; - else if (h->bucketsize < multi) + else if (h->bucketsize <= multi) h->bucketsize += AHASH_INIT_SIZE; #endif if (n->size >= AHASH_MAX(h)) { -- cgit v1.2.3 From bcd9e3c1656d0f7dd9743598c65c3ae24efb38d0 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 21 Nov 2022 19:26:15 +0100 Subject: netfilter: flowtable_offload: add missing locking nf_flow_table_block_setup and the driver TC_SETUP_FT call can modify the flow block cb list while they are being traversed elsewhere, causing a crash. 
Add a write lock around the calls to protect readers Fixes: c29f74e0df7a ("netfilter: nf_flow_table: hardware offload support") Reported-by: Chad Monroe Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_offload.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index b04645ced89b..00b522890d77 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -1098,6 +1098,7 @@ static int nf_flow_table_block_setup(struct nf_flowtable *flowtable, struct flow_block_cb *block_cb, *next; int err = 0; + down_write(&flowtable->flow_block_lock); switch (cmd) { case FLOW_BLOCK_BIND: list_splice(&bo->cb_list, &flowtable->flow_block.cb_list); @@ -1112,6 +1113,7 @@ static int nf_flow_table_block_setup(struct nf_flowtable *flowtable, WARN_ON_ONCE(1); err = -EOPNOTSUPP; } + up_write(&flowtable->flow_block_lock); return err; } @@ -1168,7 +1170,9 @@ static int nf_flow_table_offload_cmd(struct flow_block_offload *bo, nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable, extack); + down_write(&flowtable->flow_block_lock); err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_FT, bo); + up_write(&flowtable->flow_block_lock); if (err < 0) return err; -- cgit v1.2.3 From 77934dc6db0d2b111a8f2759e9ad2fb67f5cffa5 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 18 Nov 2022 17:49:11 -0800 Subject: dccp/tcp: Reset saddr on failure after inet6?_hash_connect(). When connect() is called on a socket bound to the wildcard address, we change the socket's saddr to a local address. If the socket fails to connect() to the destination, we have to reset the saddr. However, when an error occurs after inet_hash6?_connect() in (dccp|tcp)_v[46]_conect(), we forget to reset saddr and leave the socket bound to the address. From the user's point of view, whether saddr is reset or not varies with errno. Let's fix this inconsistent behaviour. Note that after this patch, the repro [0] will trigger the WARN_ON() in inet_csk_get_port() again, but this patch is not buggy and rather fixes a bug papering over the bhash2's bug for which we need another fix. For the record, the repro causes -EADDRNOTAVAIL in inet_hash6_connect() by this sequence: s1 = socket() s1.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1) s1.bind(('127.0.0.1', 10000)) s1.sendto(b'hello', MSG_FASTOPEN, (('127.0.0.1', 10000))) # or s1.connect(('127.0.0.1', 10000)) s2 = socket() s2.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1) s2.bind(('0.0.0.0', 10000)) s2.connect(('127.0.0.1', 10000)) # -EADDRNOTAVAIL s2.listen(32) # WARN_ON(inet_csk(sk)->icsk_bind2_hash != tb2); [0]: https://syzkaller.appspot.com/bug?extid=015d756bbd1f8b5c8f09 Fixes: 3df80d9320bc ("[DCCP]: Introduce DCCPv6") Fixes: 7c657876b63c ("[DCCP]: Initial implementation") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Acked-by: Joanne Koong Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/dccp/ipv4.c | 2 ++ net/dccp/ipv6.c | 2 ++ net/ipv4/tcp_ipv4.c | 2 ++ net/ipv6/tcp_ipv6.c | 2 ++ 4 files changed, 8 insertions(+) (limited to 'net') diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 713b7b8dad7e..40640c26680e 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -157,6 +157,8 @@ failure: * This unhashes the socket and releases the local port, if necessary. 
*/ dccp_set_state(sk, DCCP_CLOSED); + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); ip_rt_put(rt); sk->sk_route_caps = 0; inet->inet_dport = 0; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index e57b43006074..626166cb6d7e 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -985,6 +985,8 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, late_failure: dccp_set_state(sk, DCCP_CLOSED); + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); __sk_dst_reset(sk); failure: inet->inet_dport = 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 87d440f47a70..6a3a732b584d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -343,6 +343,8 @@ failure: * if necessary. */ tcp_set_state(sk, TCP_CLOSE); + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); ip_rt_put(rt); sk->sk_route_caps = 0; inet->inet_dport = 0; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2a3f9296df1e..81b396e5cf79 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -359,6 +359,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, late_failure: tcp_set_state(sk, TCP_CLOSE); + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); failure: inet->inet_dport = 0; sk->sk_route_caps = 0; -- cgit v1.2.3 From 8acdad37cd13ce777b0811b2d332314037fcefa8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 18 Nov 2022 17:49:12 -0800 Subject: dccp/tcp: Remove NULL check for prev_saddr in inet_bhash2_update_saddr(). When we call inet_bhash2_update_saddr(), prev_saddr is always non-NULL. Let's remove the unnecessary test. Signed-off-by: Kuniyuki Iwashima Acked-by: Joanne Koong Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/ipv4/inet_hashtables.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 033bf3c2538f..d745f962745e 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -877,13 +877,10 @@ int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct soc head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); - if (prev_saddr) { - spin_lock_bh(&prev_saddr->lock); - __sk_del_bind2_node(sk); - inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, - inet_csk(sk)->icsk_bind2_hash); - spin_unlock_bh(&prev_saddr->lock); - } + spin_lock_bh(&prev_saddr->lock); + __sk_del_bind2_node(sk); + inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash); + spin_unlock_bh(&prev_saddr->lock); spin_lock_bh(&head2->lock); tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); -- cgit v1.2.3 From 8c5dae4c1a49489499e6708c7dd284370ca36287 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 18 Nov 2022 17:49:13 -0800 Subject: dccp/tcp: Update saddr under bhash's lock. When we call connect() for a socket bound to a wildcard address, we update saddr locklessly. However, it could result in a data race; another thread iterating over bhash might see a corrupted address. Let's update saddr under the bhash bucket's lock. 
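[Editorial illustration] A hedged userspace sketch of the invariant: a field that is part of a hash key is never rewritten while other threads may be walking the table; the entry is unlinked under its bucket lock, the key is updated, and the entry is re-inserted into the (possibly different) new bucket under that bucket's lock. The toy table below is not the kernel bhash2, and it omits the extra per-port bucket lock the real patch holds across the whole move; it only shows the unlink/update/relink discipline.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define NBUCKETS 8

struct sock_s {
    uint32_t saddr;
    uint16_t port;
    struct sock_s *next;
};

struct bucket {
    pthread_mutex_t lock;
    struct sock_s *head;
};

static struct bucket table[NBUCKETS];

static void table_init(void)
{
    for (int i = 0; i < NBUCKETS; i++) {
        pthread_mutex_init(&table[i].lock, NULL);
        table[i].head = NULL;
    }
}

static unsigned int hash(uint32_t saddr, uint16_t port)
{
    return (saddr ^ port) % NBUCKETS;
}

static void unlink_sk(struct bucket *b, struct sock_s *sk)
{
    struct sock_s **pp = &b->head;

    while (*pp && *pp != sk)
        pp = &(*pp)->next;
    if (*pp)
        *pp = sk->next;
}

/* change the source address: never rewrite sk->saddr while it is hashed */
static void update_saddr(struct sock_s *sk, uint32_t new_saddr)
{
    struct bucket *oldb = &table[hash(sk->saddr, sk->port)];
    struct bucket *newb;

    pthread_mutex_lock(&oldb->lock);
    unlink_sk(oldb, sk);                 /* remove under the old bucket lock */
    pthread_mutex_unlock(&oldb->lock);

    sk->saddr = new_saddr;               /* key changes only while unhashed */

    newb = &table[hash(sk->saddr, sk->port)];
    pthread_mutex_lock(&newb->lock);
    sk->next = newb->head;               /* re-insert under the new lock */
    newb->head = sk;
    pthread_mutex_unlock(&newb->lock);
}

int main(void)
{
    struct sock_s sk = { .saddr = 0, .port = 10000, .next = NULL };
    struct bucket *b;

    table_init();
    b = &table[hash(sk.saddr, sk.port)];
    pthread_mutex_lock(&b->lock);
    sk.next = b->head;
    b->head = &sk;
    pthread_mutex_unlock(&b->lock);

    update_saddr(&sk, 0x7f000001);       /* 127.0.0.1 */
    printf("now in bucket %u\n", hash(sk.saddr, sk.port));
    return 0;
}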
Fixes: 3df80d9320bc ("[DCCP]: Introduce DCCPv6") Fixes: 7c657876b63c ("[DCCP]: Initial implementation") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Acked-by: Joanne Koong Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/inet_hashtables.h | 2 +- net/dccp/ipv4.c | 22 ++++----------------- net/dccp/ipv6.c | 23 ++++------------------ net/ipv4/af_inet.c | 11 +---------- net/ipv4/inet_hashtables.c | 45 +++++++++++++++++++++++++++++++++++++------ net/ipv4/tcp_ipv4.c | 20 ++++--------------- net/ipv6/tcp_ipv6.c | 19 +++--------------- 7 files changed, 56 insertions(+), 86 deletions(-) (limited to 'net') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 3af1e927247d..ba06e8b52264 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -281,7 +281,7 @@ inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, in * sk_v6_rcv_saddr (ipv6) changes after it has been binded. The socket's * rcv_saddr field should already have been updated when this is called. */ -int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct sock *sk); +int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family); void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, struct inet_bind2_bucket *tb2, unsigned short port); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 40640c26680e..95e376e3b911 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -45,11 +45,10 @@ static unsigned int dccp_v4_pernet_id __read_mostly; int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; - struct inet_bind_hashbucket *prev_addr_hashbucket = NULL; - __be32 daddr, nexthop, prev_sk_rcv_saddr; struct inet_sock *inet = inet_sk(sk); struct dccp_sock *dp = dccp_sk(sk); __be16 orig_sport, orig_dport; + __be32 daddr, nexthop; struct flowi4 *fl4; struct rtable *rt; int err; @@ -91,26 +90,13 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) daddr = fl4->daddr; if (inet->inet_saddr == 0) { - if (inet_csk(sk)->icsk_bind2_hash) { - prev_addr_hashbucket = - inet_bhashfn_portaddr(&dccp_hashinfo, sk, - sock_net(sk), - inet->inet_num); - prev_sk_rcv_saddr = sk->sk_rcv_saddr; - } - inet->inet_saddr = fl4->saddr; - } - - sk_rcv_saddr_set(sk, inet->inet_saddr); - - if (prev_addr_hashbucket) { - err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk); + err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); if (err) { - inet->inet_saddr = 0; - sk_rcv_saddr_set(sk, prev_sk_rcv_saddr); ip_rt_put(rt); return err; } + } else { + sk_rcv_saddr_set(sk, inet->inet_saddr); } inet->inet_dport = usin->sin_port; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 626166cb6d7e..94c101ed57a9 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -934,26 +934,11 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, } if (saddr == NULL) { - struct inet_bind_hashbucket *prev_addr_hashbucket = NULL; - struct in6_addr prev_v6_rcv_saddr; - - if (icsk->icsk_bind2_hash) { - prev_addr_hashbucket = inet_bhashfn_portaddr(&dccp_hashinfo, - sk, sock_net(sk), - inet->inet_num); - prev_v6_rcv_saddr = sk->sk_v6_rcv_saddr; - } - saddr = &fl6.saddr; - sk->sk_v6_rcv_saddr = *saddr; - - if (prev_addr_hashbucket) { - err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk); - if (err) { - sk->sk_v6_rcv_saddr = prev_v6_rcv_saddr; - goto failure; - } - } + + err = inet_bhash2_update_saddr(sk, saddr, 
AF_INET6); + if (err) + goto failure; } /* set the source address */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 4728087c42a5..0da679411330 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1230,7 +1230,6 @@ EXPORT_SYMBOL(inet_unregister_protosw); static int inet_sk_reselect_saddr(struct sock *sk) { - struct inet_bind_hashbucket *prev_addr_hashbucket; struct inet_sock *inet = inet_sk(sk); __be32 old_saddr = inet->inet_saddr; __be32 daddr = inet->inet_daddr; @@ -1260,16 +1259,8 @@ static int inet_sk_reselect_saddr(struct sock *sk) return 0; } - prev_addr_hashbucket = - inet_bhashfn_portaddr(tcp_or_dccp_get_hashinfo(sk), sk, - sock_net(sk), inet->inet_num); - - inet->inet_saddr = inet->inet_rcv_saddr = new_saddr; - - err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk); + err = inet_bhash2_update_saddr(sk, &new_saddr, AF_INET); if (err) { - inet->inet_saddr = old_saddr; - inet->inet_rcv_saddr = old_saddr; ip_rt_put(rt); return err; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index d745f962745e..18ef370af113 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -858,14 +858,34 @@ inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, in return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; } -int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct sock *sk) +static void inet_update_saddr(struct sock *sk, void *saddr, int family) +{ + if (family == AF_INET) { + inet_sk(sk)->inet_saddr = *(__be32 *)saddr; + sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr); + } +#if IS_ENABLED(CONFIG_IPV6) + else { + sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr; + } +#endif +} + +int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) { struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2, *new_tb2; int l3mdev = inet_sk_bound_l3mdev(sk); - struct inet_bind_hashbucket *head2; int port = inet_sk(sk)->inet_num; struct net *net = sock_net(sk); + int bhash; + + if (!inet_csk(sk)->icsk_bind2_hash) { + /* Not bind()ed before. */ + inet_update_saddr(sk, saddr, family); + return 0; + } /* Allocate a bind2 bucket ahead of time to avoid permanently putting * the bhash2 table in an inconsistent state if a new tb2 bucket @@ -875,14 +895,25 @@ int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct soc if (!new_tb2) return -ENOMEM; + bhash = inet_bhashfn(net, port, hinfo->bhash_size); + head = &hinfo->bhash[bhash]; head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); - spin_lock_bh(&prev_saddr->lock); + /* If we change saddr locklessly, another thread + * iterating over bhash might see corrupted address. 
+ */ + spin_lock_bh(&head->lock); + + spin_lock(&head2->lock); __sk_del_bind2_node(sk); inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash); - spin_unlock_bh(&prev_saddr->lock); + spin_unlock(&head2->lock); + + inet_update_saddr(sk, saddr, family); - spin_lock_bh(&head2->lock); + head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); + + spin_lock(&head2->lock); tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); if (!tb2) { tb2 = new_tb2; @@ -890,7 +921,9 @@ int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct soc } sk_add_bind2_node(sk, &tb2->owners); inet_csk(sk)->icsk_bind2_hash = tb2; - spin_unlock_bh(&head2->lock); + spin_unlock(&head2->lock); + + spin_unlock_bh(&head->lock); if (tb2 != new_tb2) kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6a3a732b584d..23dd7e9df2d5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -199,15 +199,14 @@ static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, /* This will initiate an outgoing connection. */ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { - struct inet_bind_hashbucket *prev_addr_hashbucket = NULL; struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; struct inet_timewait_death_row *tcp_death_row; - __be32 daddr, nexthop, prev_sk_rcv_saddr; struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct ip_options_rcu *inet_opt; struct net *net = sock_net(sk); __be16 orig_sport, orig_dport; + __be32 daddr, nexthop; struct flowi4 *fl4; struct rtable *rt; int err; @@ -251,24 +250,13 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; if (!inet->inet_saddr) { - if (inet_csk(sk)->icsk_bind2_hash) { - prev_addr_hashbucket = inet_bhashfn_portaddr(tcp_death_row->hashinfo, - sk, net, inet->inet_num); - prev_sk_rcv_saddr = sk->sk_rcv_saddr; - } - inet->inet_saddr = fl4->saddr; - } - - sk_rcv_saddr_set(sk, inet->inet_saddr); - - if (prev_addr_hashbucket) { - err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk); + err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); if (err) { - inet->inet_saddr = 0; - sk_rcv_saddr_set(sk, prev_sk_rcv_saddr); ip_rt_put(rt); return err; } + } else { + sk_rcv_saddr_set(sk, inet->inet_saddr); } if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 81b396e5cf79..2f3ca3190d26 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -292,24 +292,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; if (!saddr) { - struct inet_bind_hashbucket *prev_addr_hashbucket = NULL; - struct in6_addr prev_v6_rcv_saddr; - - if (icsk->icsk_bind2_hash) { - prev_addr_hashbucket = inet_bhashfn_portaddr(tcp_death_row->hashinfo, - sk, net, inet->inet_num); - prev_v6_rcv_saddr = sk->sk_v6_rcv_saddr; - } saddr = &fl6.saddr; - sk->sk_v6_rcv_saddr = *saddr; - if (prev_addr_hashbucket) { - err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk); - if (err) { - sk->sk_v6_rcv_saddr = prev_v6_rcv_saddr; - goto failure; - } - } + err = inet_bhash2_update_saddr(sk, saddr, AF_INET6); + if (err) + goto failure; } /* set the source address */ -- cgit v1.2.3 From e0833d1fedb02f038b526ae7dde178a076f56545 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 18 Nov 2022 17:49:14 -0800 Subject: dccp/tcp: Fixup bhash2 bucket 

If a socket bound to a wildcard address fails to connect(), we only reset
saddr and keep the port.  Then, we have to fix up the bhash2 bucket;
otherwise, the bucket has an inconsistent address in the list.

Also, listen() for such a socket will fire the WARN_ON() in
inet_csk_get_port(). [0]

Note that when a system runs out of memory, we give up fixing the bucket
and unlink sk from bhash and bhash2 by inet_put_port().

[0]:
WARNING: CPU: 0 PID: 207 at net/ipv4/inet_connection_sock.c:548 inet_csk_get_port (net/ipv4/inet_connection_sock.c:548 (discriminator 1))
Modules linked in:
CPU: 0 PID: 207 Comm: bhash2_prev_rep Not tainted 6.1.0-rc3-00799-gc8421681c845 #63
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-1.amzn2022.0.1 04/01/2014
RIP: 0010:inet_csk_get_port (net/ipv4/inet_connection_sock.c:548 (discriminator 1))
Code: 74 a7 eb 93 48 8b 54 24 18 0f b7 cb 4c 89 e6 4c 89 ff e8 48 b2 ff ff 49 8b 87 18 04 00 00 e9 32 ff ff ff 0f 0b e9 34 ff ff ff <0f> 0b e9 42 ff ff ff 41 8b 7f 50 41 8b 4f 54 89 fe 81 f6 00 00 ff
RSP: 0018:ffffc900003d7e50 EFLAGS: 00010202
RAX: ffff8881047fb500 RBX: 0000000000004e20 RCX: 0000000000000000
RDX: 000000000000000a RSI: 00000000fffffe00 RDI: 00000000ffffffff
RBP: ffffffff8324dc00 R08: 0000000000000001 R09: 0000000000000001
R10: 0000000000000001 R11: 0000000000000001 R12: 0000000000000000
R13: 0000000000000001 R14: 0000000000004e20 R15: ffff8881054e1280
FS:  00007f8ac04dc740(0000) GS:ffff88842fc00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000020001540 CR3: 00000001055fa003 CR4: 0000000000770ef0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Call Trace:
 inet_csk_listen_start (net/ipv4/inet_connection_sock.c:1205)
 inet_listen (net/ipv4/af_inet.c:228)
 __sys_listen (net/socket.c:1810)
 __x64_sys_listen (net/socket.c:1819 net/socket.c:1817 net/socket.c:1817)
 do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
 entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120)
RIP: 0033:0x7f8ac051de5d
Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 93 af 1b 00 f7 d8 64 89 01 48
RSP: 002b:00007ffc1c177248 EFLAGS: 00000206 ORIG_RAX: 0000000000000032
RAX: ffffffffffffffda RBX: 0000000020001550 RCX: 00007f8ac051de5d
RDX: ffffffffffffff80 RSI: 0000000000000000 RDI: 0000000000000004
RBP: 00007ffc1c177270 R08: 0000000000000018 R09: 0000000000000007
R10: 0000000020001540 R11: 0000000000000206 R12: 00007ffc1c177388
R13: 0000000000401169 R14: 0000000000403e18 R15: 00007f8ac0723000

Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address")
Reported-by: syzbot
Reported-by: Mat Martineau
Signed-off-by: Kuniyuki Iwashima
Acked-by: Joanne Koong
Reviewed-by: Eric Dumazet
Signed-off-by: Jakub Kicinski
---
 include/net/inet_hashtables.h |  1 +
 net/dccp/ipv4.c               |  3 +--
 net/dccp/ipv6.c               |  3 +--
 net/dccp/proto.c              |  3 +--
 net/ipv4/inet_hashtables.c    | 38 ++++++++++++++++++++++++++++++++++----
 net/ipv4/tcp.c                |  3 +--
 net/ipv4/tcp_ipv4.c           |  3 +--
 net/ipv6/tcp_ipv6.c           |  3 +--
 8 files changed, 41 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index ba06e8b52264..69174093078f 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -282,6 +282,7 @@ inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, in
  * rcv_saddr field should already have been updated when this is called.
  */
 int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family);
+void inet_bhash2_reset_saddr(struct sock *sk);
 void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, struct inet_bind2_bucket *tb2, unsigned short port);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 95e376e3b911..b780827f5e0a 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -143,8 +143,7 @@ failure:
         * This unhashes the socket and releases the local port, if necessary.
         */
        dccp_set_state(sk, DCCP_CLOSED);
-       if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
-               inet_reset_saddr(sk);
+       inet_bhash2_reset_saddr(sk);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->inet_dport = 0;
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 94c101ed57a9..602f3432d80b 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -970,8 +970,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,

 late_failure:
        dccp_set_state(sk, DCCP_CLOSED);
-       if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
-               inet_reset_saddr(sk);
+       inet_bhash2_reset_saddr(sk);
        __sk_dst_reset(sk);
 failure:
        inet->inet_dport = 0;
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index c548ca3e9b0e..85e35c5e8890 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -279,8 +279,7 @@ int dccp_disconnect(struct sock *sk, int flags)

        inet->inet_dport = 0;

-       if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
-               inet_reset_saddr(sk);
+       inet_bhash2_reset_saddr(sk);

        sk->sk_shutdown = 0;
        sock_reset_flag(sk, SOCK_DONE);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 18ef370af113..3cec471a2cd2 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -871,7 +871,7 @@ static void inet_update_saddr(struct sock *sk, void *saddr, int family)
 #endif
 }

-int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family)
+static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset)
 {
        struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
        struct inet_bind_hashbucket *head, *head2;
@@ -883,7 +883,11 @@ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family)

        if (!inet_csk(sk)->icsk_bind2_hash) {
                /* Not bind()ed before. */
-               inet_update_saddr(sk, saddr, family);
+               if (reset)
+                       inet_reset_saddr(sk);
+               else
+                       inet_update_saddr(sk, saddr, family);
+
                return 0;
        }

@@ -892,8 +896,19 @@ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family)
         * allocation fails.
         */
        new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC);
-       if (!new_tb2)
+       if (!new_tb2) {
+               if (reset) {
+                       /* The (INADDR_ANY, port) bucket might have already
+                        * been freed, then we cannot fixup icsk_bind2_hash,
+                        * so we give up and unlink sk from bhash/bhash2 not
+                        * to leave inconsistency in bhash2.
+                        */
+                       inet_put_port(sk);
+                       inet_reset_saddr(sk);
+               }
+
                return -ENOMEM;
+       }

        bhash = inet_bhashfn(net, port, hinfo->bhash_size);
        head = &hinfo->bhash[bhash];
@@ -909,7 +924,10 @@ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family)
        inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash);
        spin_unlock(&head2->lock);

-       inet_update_saddr(sk, saddr, family);
+       if (reset)
+               inet_reset_saddr(sk);
+       else
+               inet_update_saddr(sk, saddr, family);

        head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);

@@ -930,8 +948,20 @@ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family)

        return 0;
 }
+
+int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family)
+{
+       return __inet_bhash2_update_saddr(sk, saddr, family, false);
+}
 EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr);

+void inet_bhash2_reset_saddr(struct sock *sk)
+{
+       if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
+               __inet_bhash2_update_saddr(sk, NULL, 0, true);
+}
+EXPORT_SYMBOL_GPL(inet_bhash2_reset_saddr);
+
 /* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm
  * Note that we use 32bit integers (vs RFC 'short integers')
  * because 2^16 is not a multiple of num_ephemeral and this
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 54836a6b81d6..4f2205756cfe 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3114,8 +3114,7 @@ int tcp_disconnect(struct sock *sk, int flags)

        inet->inet_dport = 0;

-       if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
-               inet_reset_saddr(sk);
+       inet_bhash2_reset_saddr(sk);

        sk->sk_shutdown = 0;
        sock_reset_flag(sk, SOCK_DONE);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 23dd7e9df2d5..da46357f501b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -331,8 +331,7 @@ failure:
         * if necessary.
         */
        tcp_set_state(sk, TCP_CLOSE);
-       if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
-               inet_reset_saddr(sk);
+       inet_bhash2_reset_saddr(sk);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->inet_dport = 0;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2f3ca3190d26..f0548dbcabd2 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -346,8 +346,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,

 late_failure:
        tcp_set_state(sk, TCP_CLOSE);
-       if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
-               inet_reset_saddr(sk);
+       inet_bhash2_reset_saddr(sk);
 failure:
        inet->inet_dport = 0;
        sk->sk_route_caps = 0;
-- cgit v1.2.3
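One user-space sequence that reaches the tcp_disconnect() path touched above is
sketched below. This is a hypothetical illustration rather than the syzbot
reproducer: the port numbers are arbitrary and error handling is omitted.

  #include <arpa/inet.h>
  #include <netinet/in.h>
  #include <sys/socket.h>
  #include <unistd.h>

  int main(void)
  {
          struct sockaddr_in any = { 0 }, peer = { 0 };
          struct sockaddr unspec = { 0 };
          int lfd = socket(AF_INET, SOCK_STREAM, 0);
          int fd = socket(AF_INET, SOCK_STREAM, 0);

          /* A local listener for the first connect() to succeed against. */
          peer.sin_family = AF_INET;
          peer.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
          peer.sin_port = htons(20002);
          bind(lfd, (struct sockaddr *)&peer, sizeof(peer));
          listen(lfd, 1);

          /* Bind the test socket to (INADDR_ANY, 20001): saddr is the wildcard. */
          any.sin_family = AF_INET;
          any.sin_addr.s_addr = htonl(INADDR_ANY);
          any.sin_port = htons(20001);
          bind(fd, (struct sockaddr *)&any, sizeof(any));

          /* connect() installs 127.0.0.1 as saddr and rehashes the socket in bhash2. */
          connect(fd, (struct sockaddr *)&peer, sizeof(peer));

          /* An AF_UNSPEC connect() disconnects: saddr is reset while the port
           * is kept, so the bhash2 bucket has to be fixed up accordingly.
           */
          unspec.sa_family = AF_UNSPEC;
          connect(fd, &unspec, sizeof(unspec));

          /* Without a consistent bhash2 bucket, this listen() can trip the
           * WARN_ON() in inet_csk_get_port().
           */
          listen(fd, 1);

          close(fd);
          close(lfd);
          return 0;
  }

The key point is the wildcard bind: SOCK_BINDADDR_LOCK is not set for an
INADDR_ANY bind, so disconnecting is allowed to reset saddr and must keep
bhash2 in sync.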
From 568fe84940ac0e4e0b2cd7751b8b4911f7b9c215 Mon Sep 17 00:00:00 2001
From: Ziyang Xuan
Date: Sun, 20 Nov 2022 15:28:38 +0800
Subject: ipv4: Fix error return code in fib_table_insert()

In fib_table_insert(), if the alias was already inserted but the node does
not exist, the error code should be set before returning from the error
handling path.

Fixes: a6c76c17df02 ("ipv4: Notify route after insertion to the routing table")
Signed-off-by: Ziyang Xuan
Link: https://lore.kernel.org/r/20221120072838.2167047-1-william.xuanziyang@huawei.com
Signed-off-by: Jakub Kicinski
---
 net/ipv4/fib_trie.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index c88bf856c443..74d403dbd2b4 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1381,8 +1381,10 @@ int fib_table_insert(struct net *net, struct fib_table *tb,

        /* The alias was already inserted, so the node must exist. */
        l = l ? l : fib_find_node(t, &tp, key);
-       if (WARN_ON_ONCE(!l))
+       if (WARN_ON_ONCE(!l)) {
+               err = -ENOENT;
                goto out_free_new_fa;
+       }

        if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) ==
            new_fa) {
-- cgit v1.2.3
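The pitfall here is generic: in the goto-cleanup idiom, a branch that jumps to
the cleanup label without assigning the error variable makes the function
return whatever happens to be in it, often 0. A stand-alone illustration with
hypothetical names, not the fib_trie code:

  #include <errno.h>
  #include <stdio.h>

  static int insert_alias(int node_exists)
  {
          int err = 0;

          if (!node_exists) {
                  err = -ENOENT;          /* without this, the caller sees 0 */
                  goto out_free;
          }
          return 0;

  out_free:
          /* release the half-constructed alias here */
          return err;
  }

  int main(void)
  {
          printf("insert_alias(0) = %d\n", insert_alias(0));
          return 0;
  }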
From 391c18cf776eb4569ecda1f7794f360fe0a45a26 Mon Sep 17 00:00:00 2001
From: Dominique Martinet
Date: Fri, 18 Nov 2022 22:44:41 +0900
Subject: 9p/xen: check logical size for buffer size

trans_xen did not check that the data fits into the buffer before copying
from the xen ring, but we probably should. Add a check that simply skips
the request and returns an error to userspace if it does not fit.

Tested-by: Stefano Stabellini
Reviewed-by: Christian Schoenebeck
Link: https://lkml.kernel.org/r/20221118135542.63400-1-asmadeus@codewreck.org
Signed-off-by: Dominique Martinet
---
 net/9p/trans_xen.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'net')

diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c
index b15c64128c3e..aaa5fd364691 100644
--- a/net/9p/trans_xen.c
+++ b/net/9p/trans_xen.c
@@ -208,6 +208,14 @@ static void p9_xen_response(struct work_struct *work)
                        continue;
                }

+               if (h.size > req->rc.capacity) {
+                       dev_warn(&priv->dev->dev,
+                                "requested packet size too big: %d for tag %d with capacity %zd\n",
+                                h.size, h.tag, req->rc.capacity);
+                       req->status = REQ_STATUS_ERROR;
+                       goto recv_error;
+               }
+
                memcpy(&req->rc, &h, sizeof(h));
                req->rc.offset = 0;

@@ -217,6 +225,7 @@ static void p9_xen_response(struct work_struct *work)
                                     masked_prod, &masked_cons,
                                     XEN_9PFS_RING_SIZE(ring));

+recv_error:
                virt_mb();
                cons += h.size;
                ring->intf->in_cons = cons;
-- cgit v1.2.3
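The same rule applies to any transport that copies a peer-advertised length
out of a shared ring: validate the length against the receive buffer's logical
capacity before copying. A stand-alone sketch with hypothetical names, not the
trans_xen code:

  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>

  struct rx_buf {
          uint8_t *data;
          size_t capacity;        /* logical size of the buffer */
          size_t len;
  };

  /* Copy 'size' bytes from the ring into 'buf'; reject oversized frames. */
  static int ring_copy(struct rx_buf *buf, const uint8_t *ring, size_t size)
  {
          if (size > buf->capacity)
                  return -1;      /* skip the request, report an error */

          memcpy(buf->data, ring, size);
          buf->len = size;
          return 0;
  }

  int main(void)
  {
          uint8_t backing[16], ring[32] = { 0 };
          struct rx_buf buf = { .data = backing, .capacity = sizeof(backing) };

          printf("fits:    %d\n", ring_copy(&buf, ring, 8));
          printf("too big: %d\n", ring_copy(&buf, ring, 32));
          return 0;
  }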
From af295e854a4e3813ffbdef26dbb6a4d6226c3ea1 Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki
Date: Mon, 21 Nov 2022 09:54:26 +0100
Subject: l2tp: Don't sleep and disable BH under writer-side sk_callback_lock

When holding a reader-writer spin lock we cannot sleep. Calling
setup_udp_tunnel_sock() with the write lock held violates this rule,
because we end up calling percpu_down_read(), which might sleep, as
syzbot reports [1]:

 __might_resched.cold+0x222/0x26b kernel/sched/core.c:9890
 percpu_down_read include/linux/percpu-rwsem.h:49 [inline]
 cpus_read_lock+0x1b/0x140 kernel/cpu.c:310
 static_key_slow_inc+0x12/0x20 kernel/jump_label.c:158
 udp_tunnel_encap_enable include/net/udp_tunnel.h:187 [inline]
 setup_udp_tunnel_sock+0x43d/0x550 net/ipv4/udp_tunnel_core.c:81
 l2tp_tunnel_register+0xc51/0x1210 net/l2tp/l2tp_core.c:1509
 pppol2tp_connect+0xcdc/0x1a10 net/l2tp/l2tp_ppp.c:723

Trim the writer-side critical section for sk_callback_lock down to the
minimum, so that it covers only operations on sk_user_data.

Also, when grabbing the sk_callback_lock, we always need to disable BH, as
Eric points out. Failing to do so leads to deadlocks because we acquire
sk_callback_lock in softirq context, which can get stuck waiting on us if:

1) it runs on the same CPU, or

   CPU0
   ----
   lock(clock-AF_INET6);
   lock(clock-AF_INET6);

2) lock ordering leads to priority inversion

   CPU0                                CPU1
   ----                                ----
   lock(clock-AF_INET6);
                                       local_irq_disable();
                                       lock(&tcp_hashinfo.bhash[i].lock);
                                       lock(clock-AF_INET6);
   lock(&tcp_hashinfo.bhash[i].lock);

... as syzbot reports [2,3]. Use the _bh variants for write_(un)lock.

[1] https://lore.kernel.org/netdev/0000000000004e78ec05eda79749@google.com/
[2] https://lore.kernel.org/netdev/000000000000e38b6605eda76f98@google.com/
[3] https://lore.kernel.org/netdev/000000000000dfa31e05eda76f75@google.com/

v2:
- Check and set sk_user_data while holding sk_callback_lock for both
  L2TP encapsulation types (IP and UDP) (Tetsuo)

Cc: Tom Parkin
Cc: Tetsuo Handa
Fixes: b68777d54fac ("l2tp: Serialize access to sk_user_data with sk_callback_lock")
Reported-by: Eric Dumazet
Reported-by: syzbot+703d9e154b3b58277261@syzkaller.appspotmail.com
Reported-by: syzbot+50680ced9e98a61f7698@syzkaller.appspotmail.com
Reported-by: syzbot+de987172bb74a381879b@syzkaller.appspotmail.com
Signed-off-by: Jakub Sitnicki
Signed-off-by: David S. Miller
---
 net/l2tp/l2tp_core.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 754fdda8a5f5..9a1415fe3fa7 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1474,11 +1474,12 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net,
        }

        sk = sock->sk;
-       write_lock(&sk->sk_callback_lock);
-
+       write_lock_bh(&sk->sk_callback_lock);
        ret = l2tp_validate_socket(sk, net, tunnel->encap);
        if (ret < 0)
-               goto err_sock;
+               goto err_inval_sock;
+       rcu_assign_sk_user_data(sk, tunnel);
+       write_unlock_bh(&sk->sk_callback_lock);

        tunnel->l2tp_net = net;
        pn = l2tp_pernet(net);
@@ -1507,8 +1508,6 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net,
                };

                setup_udp_tunnel_sock(net, sock, &udp_cfg);
-       } else {
-               rcu_assign_sk_user_data(sk, tunnel);
        }

        tunnel->old_sk_destruct = sk->sk_destruct;
@@ -1522,16 +1521,18 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net,
        if (tunnel->fd >= 0)
                sockfd_put(sock);

-       write_unlock(&sk->sk_callback_lock);
        return 0;

 err_sock:
+       write_lock_bh(&sk->sk_callback_lock);
+       rcu_assign_sk_user_data(sk, NULL);
+err_inval_sock:
+       write_unlock_bh(&sk->sk_callback_lock);
+
        if (tunnel->fd < 0)
                sock_release(sock);
        else
                sockfd_put(sock);
-
-       write_unlock(&sk->sk_callback_lock);
 err:
        return ret;
 }
-- cgit v1.2.3
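For readers following the locking change rather than the individual hunks, the
resulting shape of the registration path is roughly the condensed fragment
below. It paraphrases the patched l2tp_tunnel_register(), with error handling
and the UDP config fields trimmed; it is a sketch, not the complete code.

          sk = sock->sk;

          /* Writer-side critical section: _bh variant, sk_user_data only. */
          write_lock_bh(&sk->sk_callback_lock);
          ret = l2tp_validate_socket(sk, net, tunnel->encap);
          if (ret < 0)
                  goto err_inval_sock;
          rcu_assign_sk_user_data(sk, tunnel);
          write_unlock_bh(&sk->sk_callback_lock);

          /* setup_udp_tunnel_sock() may sleep (static key update), so it
           * must run after the lock has been dropped.
           */
          if (tunnel->encap == L2TP_ENCAPTYPE_UDP)
                  setup_udp_tunnel_sock(net, sock, &udp_cfg);
          ...

  err_inval_sock:
          write_unlock_bh(&sk->sk_callback_lock);
          /* release the socket reference and return ret */

Keeping only the sk_user_data assignment inside the lock both removes the
sleeping call from the critical section and makes the _bh requirement easy to
audit.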