From 47934e06b65637c88a762d9c98329ae6e3238888 Mon Sep 17 00:00:00 2001 From: Congyu Liu Date: Tue, 18 Jan 2022 14:20:13 -0500 Subject: net: fix information leakage in /proc/net/ptype In one net namespace, after creating a packet socket without binding it to a device, users in other net namespaces can observe the new `packet_type` added by this packet socket by reading `/proc/net/ptype` file. This is minor information leakage as packet socket is namespace aware. Add a net pointer in `packet_type` to keep the net namespace of of corresponding packet socket. In `ptype_seq_show`, this net pointer must be checked when it is not NULL. Fixes: 2feb27dbe00c ("[NETNS]: Minor information leak via /proc/net/ptype file.") Signed-off-by: Congyu Liu Signed-off-by: David S. Miller --- net/core/net-procfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index d8b9dbabd4a4..5b8016335aca 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -260,7 +260,8 @@ static int ptype_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) seq_puts(seq, "Type Device Function\n"); - else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { + else if ((!pt->af_packet_net || net_eq(pt->af_packet_net, seq_file_net(seq))) && + (!pt->dev || net_eq(dev_net(pt->dev), seq_file_net(seq)))) { if (pt->type == htons(ETH_P_ALL)) seq_puts(seq, "ALL "); else -- cgit v1.2.3 From 1d10f8a1f40b965d449e8f2d5ed7b96a7c138b77 Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Fri, 21 Jan 2022 17:15:31 +0800 Subject: net-procfs: show net devices bound packet types After commit:7866a621043f ("dev: add per net_device packet type chains"), we can not get packet types that are bound to a specified net device by /proc/net/ptype, this patch fix the regression. Run "tcpdump -i ens192 udp -nns0" Before and after apply this patch: Before: [root@localhost ~]# cat /proc/net/ptype Type Device Function 0800 ip_rcv 0806 arp_rcv 86dd ipv6_rcv After: [root@localhost ~]# cat /proc/net/ptype Type Device Function ALL ens192 tpacket_rcv 0800 ip_rcv 0806 arp_rcv 86dd ipv6_rcv v1 -> v2: - fix the regression rather than adding new /proc API as suggested by Stephen Hemminger. Fixes: 7866a621043f ("dev: add per net_device packet type chains") Signed-off-by: Jianguo Wu Signed-off-by: David S. Miller --- net/core/net-procfs.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 5b8016335aca..88cc0ad7d386 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -190,12 +190,23 @@ static const struct seq_operations softnet_seq_ops = { .show = softnet_seq_show, }; -static void *ptype_get_idx(loff_t pos) +static void *ptype_get_idx(struct seq_file *seq, loff_t pos) { + struct list_head *ptype_list = NULL; struct packet_type *pt = NULL; + struct net_device *dev; loff_t i = 0; int t; + for_each_netdev_rcu(seq_file_net(seq), dev) { + ptype_list = &dev->ptype_all; + list_for_each_entry_rcu(pt, ptype_list, list) { + if (i == pos) + return pt; + ++i; + } + } + list_for_each_entry_rcu(pt, &ptype_all, list) { if (i == pos) return pt; @@ -216,22 +227,40 @@ static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); - return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN; + return *pos ? ptype_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + struct net_device *dev; struct packet_type *pt; struct list_head *nxt; int hash; ++*pos; if (v == SEQ_START_TOKEN) - return ptype_get_idx(0); + return ptype_get_idx(seq, 0); pt = v; nxt = pt->list.next; + if (pt->dev) { + if (nxt != &pt->dev->ptype_all) + goto found; + + dev = pt->dev; + for_each_netdev_continue_rcu(seq_file_net(seq), dev) { + if (!list_empty(&dev->ptype_all)) { + nxt = dev->ptype_all.next; + goto found; + } + } + + nxt = ptype_all.next; + goto ptype_all; + } + if (pt->type == htons(ETH_P_ALL)) { +ptype_all: if (nxt != &ptype_all) goto found; hash = 0; -- cgit v1.2.3 From c6f6f2444bdbe0079e41914a35081530d0409963 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 31 Jan 2022 17:21:06 -0800 Subject: rtnetlink: make sure to refresh master_dev/m_ops in __rtnl_newlink() While looking at one unrelated syzbot bug, I found the replay logic in __rtnl_newlink() to potentially trigger use-after-free. It is better to clear master_dev and m_ops inside the loop, in case we have to replay it. Fixes: ba7d49b1f0f8 ("rtnetlink: provide api for getting and setting slave info") Signed-off-by: Eric Dumazet Cc: Jiri Pirko Link: https://lore.kernel.org/r/20220201012106.216495-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e476403231f0..710da8a36729 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3275,8 +3275,8 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; unsigned char name_assign_type = NET_NAME_USER; struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; - const struct rtnl_link_ops *m_ops = NULL; - struct net_device *master_dev = NULL; + const struct rtnl_link_ops *m_ops; + struct net_device *master_dev; struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; struct nlattr *tb[IFLA_MAX + 1]; @@ -3314,6 +3314,8 @@ replay: else dev = NULL; + master_dev = NULL; + m_ops = NULL; if (dev) { master_dev = netdev_master_upper_dev_get(dev); if (master_dev) -- cgit v1.2.3 From 4a81f6da9cb2d1ef911131a6fd8bd15cb61fc772 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 1 Feb 2022 20:39:42 +0100 Subject: net, neigh: Do not trigger immediate probes on NUD_FAILED from neigh_managed_work syzkaller was able to trigger a deadlock for NTF_MANAGED entries [0]: kworker/0:16/14617 is trying to acquire lock: ffffffff8d4dd370 (&tbl->lock){++-.}-{2:2}, at: ___neigh_create+0x9e1/0x2990 net/core/neighbour.c:652 [...] but task is already holding lock: ffffffff8d4dd370 (&tbl->lock){++-.}-{2:2}, at: neigh_managed_work+0x35/0x250 net/core/neighbour.c:1572 The neighbor entry turned to NUD_FAILED state, where __neigh_event_send() triggered an immediate probe as per commit cd28ca0a3dd1 ("neigh: reduce arp latency") via neigh_probe() given table lock was held. One option to fix this situation is to defer the neigh_probe() back to the neigh_timer_handler() similarly as pre cd28ca0a3dd1. For the case of NTF_MANAGED, this deferral is acceptable given this only happens on actual failure state and regular / expected state is NUD_VALID with the entry already present. The fix adds a parameter to __neigh_event_send() in order to communicate whether immediate probe is allowed or disallowed. Existing call-sites of neigh_event_send() default as-is to immediate probe. However, the neigh_managed_work() disables it via use of neigh_event_send_probe(). [0] __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_deadlock_bug kernel/locking/lockdep.c:2956 [inline] check_deadlock kernel/locking/lockdep.c:2999 [inline] validate_chain kernel/locking/lockdep.c:3788 [inline] __lock_acquire.cold+0x149/0x3ab kernel/locking/lockdep.c:5027 lock_acquire kernel/locking/lockdep.c:5639 [inline] lock_acquire+0x1ab/0x510 kernel/locking/lockdep.c:5604 __raw_write_lock_bh include/linux/rwlock_api_smp.h:202 [inline] _raw_write_lock_bh+0x2f/0x40 kernel/locking/spinlock.c:334 ___neigh_create+0x9e1/0x2990 net/core/neighbour.c:652 ip6_finish_output2+0x1070/0x14f0 net/ipv6/ip6_output.c:123 __ip6_finish_output net/ipv6/ip6_output.c:191 [inline] __ip6_finish_output+0x61e/0xe90 net/ipv6/ip6_output.c:170 ip6_finish_output+0x32/0x200 net/ipv6/ip6_output.c:201 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip6_output+0x1e4/0x530 net/ipv6/ip6_output.c:224 dst_output include/net/dst.h:451 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] ndisc_send_skb+0xa99/0x17f0 net/ipv6/ndisc.c:508 ndisc_send_ns+0x3a9/0x840 net/ipv6/ndisc.c:650 ndisc_solicit+0x2cd/0x4f0 net/ipv6/ndisc.c:742 neigh_probe+0xc2/0x110 net/core/neighbour.c:1040 __neigh_event_send+0x37d/0x1570 net/core/neighbour.c:1201 neigh_event_send include/net/neighbour.h:470 [inline] neigh_managed_work+0x162/0x250 net/core/neighbour.c:1574 process_one_work+0x9ac/0x1650 kernel/workqueue.c:2307 worker_thread+0x657/0x1110 kernel/workqueue.c:2454 kthread+0x2e9/0x3a0 kernel/kthread.c:377 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 Fixes: 7482e3841d52 ("net, neigh: Add NTF_MANAGED flag for managed neighbor entries") Reported-by: syzbot+5239d0e1778a500d477a@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Cc: Eric Dumazet Cc: Roopa Prabhu Tested-by: syzbot+5239d0e1778a500d477a@syzkaller.appspotmail.com Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220201193942.5055-1-daniel@iogearbox.net Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 18 +++++++++++++----- net/core/neighbour.c | 18 ++++++++++++------ 2 files changed, 25 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 937389e04c8e..87419f7f5421 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -350,7 +350,8 @@ static inline struct neighbour *neigh_create(struct neigh_table *tbl, return __neigh_create(tbl, pkey, dev, true); } void neigh_destroy(struct neighbour *neigh); -int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb); +int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, + const bool immediate_ok); int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid); void __neigh_set_probe_once(struct neighbour *neigh); @@ -460,17 +461,24 @@ static inline struct neighbour * neigh_clone(struct neighbour *neigh) #define neigh_hold(n) refcount_inc(&(n)->refcnt) -static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +static __always_inline int neigh_event_send_probe(struct neighbour *neigh, + struct sk_buff *skb, + const bool immediate_ok) { unsigned long now = jiffies; - + if (READ_ONCE(neigh->used) != now) WRITE_ONCE(neigh->used, now); - if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE))) - return __neigh_event_send(neigh, skb); + if (!(neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))) + return __neigh_event_send(neigh, skb, immediate_ok); return 0; } +static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +{ + return neigh_event_send_probe(neigh, skb, true); +} + #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) { diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 6c2016f7f3d1..ec0bf737b076 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1133,7 +1133,8 @@ out: neigh_release(neigh); } -int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, + const bool immediate_ok) { int rc; bool immediate_probe = false; @@ -1154,12 +1155,17 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) atomic_set(&neigh->probes, NEIGH_VAR(neigh->parms, UCAST_PROBES)); neigh_del_timer(neigh); - neigh->nud_state = NUD_INCOMPLETE; + neigh->nud_state = NUD_INCOMPLETE; neigh->updated = now; - next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), - HZ/100); + if (!immediate_ok) { + next = now + 1; + } else { + immediate_probe = true; + next = now + max(NEIGH_VAR(neigh->parms, + RETRANS_TIME), + HZ / 100); + } neigh_add_timer(neigh, next); - immediate_probe = true; } else { neigh->nud_state = NUD_FAILED; neigh->updated = jiffies; @@ -1571,7 +1577,7 @@ static void neigh_managed_work(struct work_struct *work) write_lock_bh(&tbl->lock); list_for_each_entry(neigh, &tbl->managed_list, managed_list) - neigh_event_send(neigh, NULL); + neigh_event_send_probe(neigh, NULL, false); queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, NEIGH_VAR(&tbl->parms, DELAY_PROBE_TIME)); write_unlock_bh(&tbl->lock); -- cgit v1.2.3 From 58e61e416b5abedcacd32032144b333bca30cf1e Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Wed, 9 Feb 2022 07:02:42 -0800 Subject: skbuff: cleanup double word in comment Remove the second 'to'. Signed-off-by: Tom Rix Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0118f0afaa4f..9d0388bed0c1 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -681,7 +681,7 @@ exit: * while trying to recycle fragments on __skb_frag_unref() we need * to make one SKB responsible for triggering the recycle path. * So disable the recycling bit if an SKB is cloned and we have - * additional references to to the fragmented part of the SKB. + * additional references to the fragmented part of the SKB. * Eventually the last SKB will have the recycling bit set and it's * dataref set to 0, which will trigger the recycling */ -- cgit v1.2.3 From dcd54265c8bc14bd023815e36e2d5f9d66ee1fee Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Feb 2022 09:13:31 -0800 Subject: drop_monitor: fix data-race in dropmon_net_event / trace_napi_poll_hit trace_napi_poll_hit() is reading stat->dev while another thread can write on it from dropmon_net_event() Use READ_ONCE()/WRITE_ONCE() here, RCU rules are properly enforced already, we only have to take care of load/store tearing. BUG: KCSAN: data-race in dropmon_net_event / trace_napi_poll_hit write to 0xffff88816f3ab9c0 of 8 bytes by task 20260 on cpu 1: dropmon_net_event+0xb8/0x2b0 net/core/drop_monitor.c:1579 notifier_call_chain kernel/notifier.c:84 [inline] raw_notifier_call_chain+0x53/0xb0 kernel/notifier.c:392 call_netdevice_notifiers_info net/core/dev.c:1919 [inline] call_netdevice_notifiers_extack net/core/dev.c:1931 [inline] call_netdevice_notifiers net/core/dev.c:1945 [inline] unregister_netdevice_many+0x867/0xfb0 net/core/dev.c:10415 ip_tunnel_delete_nets+0x24a/0x280 net/ipv4/ip_tunnel.c:1123 vti_exit_batch_net+0x2a/0x30 net/ipv4/ip_vti.c:515 ops_exit_list net/core/net_namespace.c:173 [inline] cleanup_net+0x4dc/0x8d0 net/core/net_namespace.c:597 process_one_work+0x3f6/0x960 kernel/workqueue.c:2307 worker_thread+0x616/0xa70 kernel/workqueue.c:2454 kthread+0x1bf/0x1e0 kernel/kthread.c:377 ret_from_fork+0x1f/0x30 read to 0xffff88816f3ab9c0 of 8 bytes by interrupt on cpu 0: trace_napi_poll_hit+0x89/0x1c0 net/core/drop_monitor.c:292 trace_napi_poll include/trace/events/napi.h:14 [inline] __napi_poll+0x36b/0x3f0 net/core/dev.c:6366 napi_poll net/core/dev.c:6432 [inline] net_rx_action+0x29e/0x650 net/core/dev.c:6519 __do_softirq+0x158/0x2de kernel/softirq.c:558 do_softirq+0xb1/0xf0 kernel/softirq.c:459 __local_bh_enable_ip+0x68/0x70 kernel/softirq.c:383 __raw_spin_unlock_bh include/linux/spinlock_api_smp.h:167 [inline] _raw_spin_unlock_bh+0x33/0x40 kernel/locking/spinlock.c:210 spin_unlock_bh include/linux/spinlock.h:394 [inline] ptr_ring_consume_bh include/linux/ptr_ring.h:367 [inline] wg_packet_decrypt_worker+0x73c/0x780 drivers/net/wireguard/receive.c:506 process_one_work+0x3f6/0x960 kernel/workqueue.c:2307 worker_thread+0x616/0xa70 kernel/workqueue.c:2454 kthread+0x1bf/0x1e0 kernel/kthread.c:377 ret_from_fork+0x1f/0x30 value changed: 0xffff88815883e000 -> 0x0000000000000000 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 26435 Comm: kworker/0:1 Not tainted 5.17.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: wg-crypt-wg2 wg_packet_decrypt_worker Fixes: 4ea7e38696c7 ("dropmon: add ability to detect when hardware dropsrxpackets") Signed-off-by: Eric Dumazet Cc: Neil Horman Reported-by: syzbot Signed-off-by: David S. Miller --- net/core/drop_monitor.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 7b288a121a41..d5dc6be2522c 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -283,13 +283,17 @@ static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi, rcu_read_lock(); list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + struct net_device *dev; + /* * only add a note to our monitor buffer if: * 1) this is the dev we received on * 2) its after the last_rx delta * 3) our rx_dropped count has gone up */ - if ((new_stat->dev == napi->dev) && + /* Paired with WRITE_ONCE() in dropmon_net_event() */ + dev = READ_ONCE(new_stat->dev); + if ((dev == napi->dev) && (time_after(jiffies, new_stat->last_rx + dm_hw_check_delta)) && (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { trace_drop_common(NULL, NULL); @@ -1576,7 +1580,10 @@ static int dropmon_net_event(struct notifier_block *ev_block, mutex_lock(&net_dm_mutex); list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) { if (new_stat->dev == dev) { - new_stat->dev = NULL; + + /* Paired with READ_ONCE() in trace_napi_poll_hit() */ + WRITE_ONCE(new_stat->dev, NULL); + if (trace_state == TRACE_OFF) { list_del_rcu(&new_stat->list); kfree_rcu(new_stat, rcu); -- cgit v1.2.3 From 5891cd5ec46c2c2eb6427cb54d214b149635dd0e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 11 Feb 2022 12:06:23 -0800 Subject: net_sched: add __rcu annotation to netdev->qdisc syzbot found a data-race [1] which lead me to add __rcu annotations to netdev->qdisc, and proper accessors to get LOCKDEP support. [1] BUG: KCSAN: data-race in dev_activate / qdisc_lookup_rcu write to 0xffff888168ad6410 of 8 bytes by task 13559 on cpu 1: attach_default_qdiscs net/sched/sch_generic.c:1167 [inline] dev_activate+0x2ed/0x8f0 net/sched/sch_generic.c:1221 __dev_open+0x2e9/0x3a0 net/core/dev.c:1416 __dev_change_flags+0x167/0x3f0 net/core/dev.c:8139 rtnl_configure_link+0xc2/0x150 net/core/rtnetlink.c:3150 __rtnl_newlink net/core/rtnetlink.c:3489 [inline] rtnl_newlink+0xf4d/0x13e0 net/core/rtnetlink.c:3529 rtnetlink_rcv_msg+0x745/0x7e0 net/core/rtnetlink.c:5594 netlink_rcv_skb+0x14e/0x250 net/netlink/af_netlink.c:2494 rtnetlink_rcv+0x18/0x20 net/core/rtnetlink.c:5612 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x602/0x6d0 net/netlink/af_netlink.c:1343 netlink_sendmsg+0x728/0x850 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmsg+0x195/0x230 net/socket.c:2496 __do_sys_sendmsg net/socket.c:2505 [inline] __se_sys_sendmsg net/socket.c:2503 [inline] __x64_sys_sendmsg+0x42/0x50 net/socket.c:2503 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff888168ad6410 of 8 bytes by task 13560 on cpu 0: qdisc_lookup_rcu+0x30/0x2e0 net/sched/sch_api.c:323 __tcf_qdisc_find+0x74/0x3a0 net/sched/cls_api.c:1050 tc_del_tfilter+0x1c7/0x1350 net/sched/cls_api.c:2211 rtnetlink_rcv_msg+0x5ba/0x7e0 net/core/rtnetlink.c:5585 netlink_rcv_skb+0x14e/0x250 net/netlink/af_netlink.c:2494 rtnetlink_rcv+0x18/0x20 net/core/rtnetlink.c:5612 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x602/0x6d0 net/netlink/af_netlink.c:1343 netlink_sendmsg+0x728/0x850 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmsg+0x195/0x230 net/socket.c:2496 __do_sys_sendmsg net/socket.c:2505 [inline] __se_sys_sendmsg net/socket.c:2503 [inline] __x64_sys_sendmsg+0x42/0x50 net/socket.c:2503 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0xffffffff85dee080 -> 0xffff88815d96ec00 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 13560 Comm: syz-executor.2 Not tainted 5.17.0-rc3-syzkaller-00116-gf1baf68e1383-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 470502de5bdb ("net: sched: unlock rules update API") Signed-off-by: Eric Dumazet Cc: Vlad Buslov Reported-by: syzbot Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- net/core/rtnetlink.c | 6 ++++-- net/sched/cls_api.c | 6 +++--- net/sched/sch_api.c | 22 ++++++++++++---------- net/sched/sch_generic.c | 29 ++++++++++++++++------------- 5 files changed, 36 insertions(+), 29 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e490b84732d1..8b5a314db167 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2158,7 +2158,7 @@ struct net_device { struct netdev_queue *_tx ____cacheline_aligned_in_smp; unsigned int num_tx_queues; unsigned int real_num_tx_queues; - struct Qdisc *qdisc; + struct Qdisc __rcu *qdisc; unsigned int tx_queue_len; spinlock_t tx_global_lock; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 710da8a36729..2fb8eb6791e8 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1699,6 +1699,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, { struct ifinfomsg *ifm; struct nlmsghdr *nlh; + struct Qdisc *qdisc; ASSERT_RTNL(); nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); @@ -1716,6 +1717,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid)) goto nla_put_failure; + qdisc = rtnl_dereference(dev->qdisc); if (nla_put_string(skb, IFLA_IFNAME, dev->name) || nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) || nla_put_u8(skb, IFLA_OPERSTATE, @@ -1735,8 +1737,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, #endif put_master_ifindex(skb, dev) || nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || - (dev->qdisc && - nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) || + (qdisc && + nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) || nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_up_count) + diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 5f0f346b576f..5ce1208a6ea3 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1044,7 +1044,7 @@ static int __tcf_qdisc_find(struct net *net, struct Qdisc **q, /* Find qdisc */ if (!*parent) { - *q = dev->qdisc; + *q = rcu_dereference(dev->qdisc); *parent = (*q)->handle; } else { *q = qdisc_lookup_rcu(dev, TC_H_MAJ(*parent)); @@ -2587,7 +2587,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) parent = tcm->tcm_parent; if (!parent) - q = dev->qdisc; + q = rtnl_dereference(dev->qdisc); else q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); if (!q) @@ -2962,7 +2962,7 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; if (!tcm->tcm_parent) - q = dev->qdisc; + q = rtnl_dereference(dev->qdisc); else q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 179825a3b2fd..e3c0e8ea2dbb 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -301,7 +301,7 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) if (!handle) return NULL; - q = qdisc_match_from_root(dev->qdisc, handle); + q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle); if (q) goto out; @@ -320,7 +320,7 @@ struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle) if (!handle) return NULL; - q = qdisc_match_from_root(dev->qdisc, handle); + q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle); if (q) goto out; @@ -1082,10 +1082,10 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, skip: if (!ingress) { notify_and_destroy(net, skb, n, classid, - dev->qdisc, new); + rtnl_dereference(dev->qdisc), new); if (new && !new->ops->attach) qdisc_refcount_inc(new); - dev->qdisc = new ? : &noop_qdisc; + rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc); if (new && new->ops->attach) new->ops->attach(new); @@ -1451,7 +1451,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, q = dev_ingress_queue(dev)->qdisc_sleeping; } } else { - q = dev->qdisc; + q = rtnl_dereference(dev->qdisc); } if (!q) { NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device"); @@ -1540,7 +1540,7 @@ replay: q = dev_ingress_queue(dev)->qdisc_sleeping; } } else { - q = dev->qdisc; + q = rtnl_dereference(dev->qdisc); } /* It may be default qdisc, ignore it */ @@ -1762,7 +1762,8 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) s_q_idx = 0; q_idx = 0; - if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx, + if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc), + skb, cb, &q_idx, s_q_idx, true, tca[TCA_DUMP_INVISIBLE]) < 0) goto done; @@ -2033,7 +2034,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, } else if (qid1) { qid = qid1; } else if (qid == 0) - qid = dev->qdisc->handle; + qid = rtnl_dereference(dev->qdisc)->handle; /* Now qid is genuine qdisc handle consistent * both with parent and child. @@ -2044,7 +2045,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, portid = TC_H_MAKE(qid, portid); } else { if (qid == 0) - qid = dev->qdisc->handle; + qid = rtnl_dereference(dev->qdisc)->handle; } /* OK. Locate qdisc */ @@ -2205,7 +2206,8 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) s_t = cb->args[0]; t = 0; - if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t, true) < 0) + if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc), + skb, tcm, cb, &t, s_t, true) < 0) goto done; dev_queue = dev_ingress_queue(dev); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index f893d9a81b01..5bab9f8b8f45 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1164,30 +1164,33 @@ static void attach_default_qdiscs(struct net_device *dev) if (!netif_is_multiqueue(dev) || dev->priv_flags & IFF_NO_QUEUE) { netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); - dev->qdisc = txq->qdisc_sleeping; - qdisc_refcount_inc(dev->qdisc); + qdisc = txq->qdisc_sleeping; + rcu_assign_pointer(dev->qdisc, qdisc); + qdisc_refcount_inc(qdisc); } else { qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT, NULL); if (qdisc) { - dev->qdisc = qdisc; + rcu_assign_pointer(dev->qdisc, qdisc); qdisc->ops->attach(qdisc); } } + qdisc = rtnl_dereference(dev->qdisc); /* Detect default qdisc setup/init failed and fallback to "noqueue" */ - if (dev->qdisc == &noop_qdisc) { + if (qdisc == &noop_qdisc) { netdev_warn(dev, "default qdisc (%s) fail, fallback to %s\n", default_qdisc_ops->id, noqueue_qdisc_ops.id); dev->priv_flags |= IFF_NO_QUEUE; netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); - dev->qdisc = txq->qdisc_sleeping; - qdisc_refcount_inc(dev->qdisc); + qdisc = txq->qdisc_sleeping; + rcu_assign_pointer(dev->qdisc, qdisc); + qdisc_refcount_inc(qdisc); dev->priv_flags ^= IFF_NO_QUEUE; } #ifdef CONFIG_NET_SCHED - if (dev->qdisc != &noop_qdisc) - qdisc_hash_add(dev->qdisc, false); + if (qdisc != &noop_qdisc) + qdisc_hash_add(qdisc, false); #endif } @@ -1217,7 +1220,7 @@ void dev_activate(struct net_device *dev) * and noqueue_qdisc for virtual interfaces */ - if (dev->qdisc == &noop_qdisc) + if (rtnl_dereference(dev->qdisc) == &noop_qdisc) attach_default_qdiscs(dev); if (!netif_carrier_ok(dev)) @@ -1383,7 +1386,7 @@ static int qdisc_change_tx_queue_len(struct net_device *dev, void dev_qdisc_change_real_num_tx(struct net_device *dev, unsigned int new_real_tx) { - struct Qdisc *qdisc = dev->qdisc; + struct Qdisc *qdisc = rtnl_dereference(dev->qdisc); if (qdisc->ops->change_real_num_tx) qdisc->ops->change_real_num_tx(qdisc, new_real_tx); @@ -1447,7 +1450,7 @@ static void dev_init_scheduler_queue(struct net_device *dev, void dev_init_scheduler(struct net_device *dev) { - dev->qdisc = &noop_qdisc; + rcu_assign_pointer(dev->qdisc, &noop_qdisc); netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc); if (dev_ingress_queue(dev)) dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); @@ -1475,8 +1478,8 @@ void dev_shutdown(struct net_device *dev) netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); if (dev_ingress_queue(dev)) shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); - qdisc_put(dev->qdisc); - dev->qdisc = &noop_qdisc; + qdisc_put(rtnl_dereference(dev->qdisc)); + rcu_assign_pointer(dev->qdisc, &noop_qdisc); WARN_ON(timer_pending(&dev->watchdog_timer)); } -- cgit v1.2.3