From c5cff8561d2d0006e972bd114afd51f082fee77c Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 21 Aug 2017 09:47:10 -0700 Subject: ipv6: add rcu grace period before freeing fib6_node We currently keep rt->rt6i_node pointing to the fib6_node for the route. And some functions make use of this pointer to dereference the fib6_node from rt structure, e.g. rt6_check(). However, as there is neither refcount nor rcu taken when dereferencing rt->rt6i_node, it could potentially cause crashes as rt->rt6i_node could be set to NULL by other CPUs when doing a route deletion. This patch introduces an rcu grace period before freeing fib6_node and makes sure the functions that dereference it takes rcu_read_lock(). Note: there is no "Fixes" tag because this bug was there in a very early stage. Signed-off-by: Wei Wang Acked-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 1a88008cc6f5..e9c59db92942 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -70,6 +70,7 @@ struct fib6_node { __u16 fn_flags; int fn_sernum; struct rt6_info *rr_ptr; + struct rcu_head rcu; }; #ifndef CONFIG_IPV6_SUBTREES @@ -167,13 +168,40 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) rt0->rt6i_flags |= RTF_EXPIRES; } +/* Function to safely get fn->sernum for passed in rt + * and store result in passed in cookie. + * Return true if we can get cookie safely + * Return false if not + */ +static inline bool rt6_get_cookie_safe(const struct rt6_info *rt, + u32 *cookie) +{ + struct fib6_node *fn; + bool status = false; + + rcu_read_lock(); + fn = rcu_dereference(rt->rt6i_node); + + if (fn) { + *cookie = fn->fn_sernum; + status = true; + } + + rcu_read_unlock(); + return status; +} + static inline u32 rt6_get_cookie(const struct rt6_info *rt) { + u32 cookie = 0; + if (rt->rt6i_flags & RTF_PCPU || (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from)) rt = (struct rt6_info *)(rt->dst.from); - return rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; + rt6_get_cookie_safe(rt, &cookie); + + return cookie; } static inline void ip6_rt_put(struct rt6_info *rt) -- cgit v1.2.3 From 551143d8d954fe398324a5caa276f518466c428b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 24 Aug 2017 21:12:28 -0700 Subject: net_sched: fix a refcount_t issue with noop_qdisc syzkaller reported a refcount_t warning [1] Issue here is that noop_qdisc refcnt was never really considered as a true refcount, since qdisc_destroy() found TCQ_F_BUILTIN set : if (qdisc->flags & TCQ_F_BUILTIN || !refcount_dec_and_test(&qdisc->refcnt))) return; Meaning that all atomic_inc() we did on noop_qdisc.refcnt were not really needed, but harmless until refcount_t came. To fix this problem, we simply need to not increment noop_qdisc.refcnt, since we never decrement it. [1] refcount_t: increment on 0; use-after-free. ------------[ cut here ]------------ WARNING: CPU: 0 PID: 21754 at lib/refcount.c:152 refcount_inc+0x47/0x50 lib/refcount.c:152 Kernel panic - not syncing: panic_on_warn set ... CPU: 0 PID: 21754 Comm: syz-executor7 Not tainted 4.13.0-rc6+ #20 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 panic+0x1e4/0x417 kernel/panic.c:180 __warn+0x1c4/0x1d9 kernel/panic.c:541 report_bug+0x211/0x2d0 lib/bug.c:183 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:190 do_trap_no_signal arch/x86/kernel/traps.c:224 [inline] do_trap+0x260/0x390 arch/x86/kernel/traps.c:273 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:310 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:323 invalid_op+0x1e/0x30 arch/x86/entry/entry_64.S:846 RIP: 0010:refcount_inc+0x47/0x50 lib/refcount.c:152 RSP: 0018:ffff8801c43477a0 EFLAGS: 00010282 RAX: 000000000000002b RBX: ffffffff86093c14 RCX: 0000000000000000 RDX: 000000000000002b RSI: ffffffff8159314e RDI: ffffed0038868ee8 RBP: ffff8801c43477a8 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff86093ac0 R13: 0000000000000001 R14: ffff8801d0f3bac0 R15: dffffc0000000000 attach_default_qdiscs net/sched/sch_generic.c:792 [inline] dev_activate+0x7d3/0xaa0 net/sched/sch_generic.c:833 __dev_open+0x227/0x330 net/core/dev.c:1380 __dev_change_flags+0x695/0x990 net/core/dev.c:6726 dev_change_flags+0x88/0x140 net/core/dev.c:6792 dev_ifsioc+0x5a6/0x930 net/core/dev_ioctl.c:256 dev_ioctl+0x2bc/0xf90 net/core/dev_ioctl.c:554 sock_do_ioctl+0x94/0xb0 net/socket.c:968 sock_ioctl+0x2c2/0x440 net/socket.c:1058 vfs_ioctl fs/ioctl.c:45 [inline] do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:685 SYSC_ioctl fs/ioctl.c:700 [inline] SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691 Fixes: 7b9364050246 ("net, sched: convert Qdisc.refcnt from atomic_t to refcount_t") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Reshetova, Elena Signed-off-by: David S. Miller --- include/net/sch_generic.h | 7 +++++++ net/sched/sch_api.c | 6 +++--- net/sched/sch_generic.c | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 67f815e5d525..c1109cdbbfa6 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -101,6 +101,13 @@ struct Qdisc { spinlock_t busylock ____cacheline_aligned_in_smp; }; +static inline void qdisc_refcount_inc(struct Qdisc *qdisc) +{ + if (qdisc->flags & TCQ_F_BUILTIN) + return; + refcount_inc(&qdisc->refcnt); +} + static inline bool qdisc_is_running(const struct Qdisc *qdisc) { return (raw_read_seqcount(&qdisc->running) & 1) ? true : false; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index a3fa144b8648..4fb5a3222d0d 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -836,7 +836,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, old = dev_graft_qdisc(dev_queue, new); if (new && i > 0) - refcount_inc(&new->refcnt); + qdisc_refcount_inc(new); if (!ingress) qdisc_destroy(old); @@ -847,7 +847,7 @@ skip: notify_and_destroy(net, skb, n, classid, dev->qdisc, new); if (new && !new->ops->attach) - refcount_inc(&new->refcnt); + qdisc_refcount_inc(new); dev->qdisc = new ? : &noop_qdisc; if (new && new->ops->attach) @@ -1256,7 +1256,7 @@ replay: if (q == p || (p && check_loop(q, p, 0))) return -ELOOP; - refcount_inc(&q->refcnt); + qdisc_refcount_inc(q); goto graft; } else { if (!q) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 57ba406f1437..4ba6da5fb254 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -785,7 +785,7 @@ static void attach_default_qdiscs(struct net_device *dev) dev->priv_flags & IFF_NO_QUEUE) { netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); dev->qdisc = txq->qdisc_sleeping; - refcount_inc(&dev->qdisc->refcnt); + qdisc_refcount_inc(dev->qdisc); } else { qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT); if (qdisc) { -- cgit v1.2.3 From ebfa00c5745660fe7f0a91eea88d4dff658486c4 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 25 Aug 2017 13:10:12 +0200 Subject: tcp: fix refcnt leak with ebpf congestion control There are a few bugs around refcnt handling in the new BPF congestion control setsockopt: - The new ca is assigned to icsk->icsk_ca_ops even in the case where we cannot get a reference on it. This would lead to a use after free, since that ca is going away soon. - Changing the congestion control case doesn't release the refcnt on the previous ca. - In the reinit case, we first leak a reference on the old ca, then we call tcp_reinit_congestion_control on the ca that we have just assigned, leading to deinitializing the wrong ca (->release of the new ca on the old ca's data) and releasing the refcount on the ca that we actually want to use. This is visible by building (for example) BIC as a module and setting net.ipv4.tcp_congestion_control=bic, and using tcp_cong_kern.c from samples/bpf. This patch fixes the refcount issues, and moves reinit back into tcp core to avoid passing a ca pointer back to BPF. Fixes: 91b5b21c7c16 ("bpf: Add support for changing congestion control") Signed-off-by: Sabrina Dubroca Acked-by: Lawrence Brakmo Signed-off-by: David S. Miller --- include/net/tcp.h | 4 +--- net/core/filter.c | 7 ++----- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_cong.c | 19 ++++++++++++++----- 4 files changed, 18 insertions(+), 14 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index ada65e767b28..f642a39f9eee 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1004,9 +1004,7 @@ void tcp_get_default_congestion_control(char *name); void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); -int tcp_set_congestion_control(struct sock *sk, const char *name, bool load); -void tcp_reinit_congestion_control(struct sock *sk, - const struct tcp_congestion_ops *ca); +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit); u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); diff --git a/net/core/filter.c b/net/core/filter.c index 8eb81e5fae08..169974998c76 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2836,15 +2836,12 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { char name[TCP_CA_NAME_MAX]; + bool reinit = bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN; strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); name[TCP_CA_NAME_MAX-1] = 0; - ret = tcp_set_congestion_control(sk, name, false); - if (!ret && bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN) - /* replacing an existing ca */ - tcp_reinit_congestion_control(sk, - inet_csk(sk)->icsk_ca_ops); + ret = tcp_set_congestion_control(sk, name, false, reinit); } else { struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 71ce33decd97..a3e91b552edc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2481,7 +2481,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, name[val] = 0; lock_sock(sk); - err = tcp_set_congestion_control(sk, name, true); + err = tcp_set_congestion_control(sk, name, true, true); release_sock(sk); return err; } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index fde983f6376b..421ea1b918da 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -189,8 +189,8 @@ void tcp_init_congestion_control(struct sock *sk) INET_ECN_dontxmit(sk); } -void tcp_reinit_congestion_control(struct sock *sk, - const struct tcp_congestion_ops *ca) +static void tcp_reinit_congestion_control(struct sock *sk, + const struct tcp_congestion_ops *ca) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -338,7 +338,7 @@ out: * tcp_reinit_congestion_control (if the current congestion control was * already initialized. */ -int tcp_set_congestion_control(struct sock *sk, const char *name, bool load) +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; @@ -360,9 +360,18 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load) if (!ca) { err = -ENOENT; } else if (!load) { - icsk->icsk_ca_ops = ca; - if (!try_module_get(ca->owner)) + const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops; + + if (try_module_get(ca->owner)) { + if (reinit) { + tcp_reinit_congestion_control(sk, ca); + } else { + icsk->icsk_ca_ops = ca; + module_put(old_ca->owner); + } + } else { err = -EBUSY; + } } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) { err = -EPERM; -- cgit v1.2.3 From 64f0f5d18a47c703c85576375cc010e83dac6a48 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 25 Aug 2017 14:31:01 +0200 Subject: udp6: set rx_dst_cookie on rx_dst updates Currently, in the udp6 code, the dst cookie is not initialized/updated concurrently with the RX dst used by early demux. As a result, the dst_check() in the early_demux path always fails, the rx dst cache is always invalidated, and we can't really leverage significant gain from the demux lookup. Fix it adding udp6 specific variant of sk_rx_dst_set() and use it to set the dst cookie when the dst entry is really changed. The issue is there since the introduction of early demux for ipv6. Fixes: 5425077d73e0 ("net: ipv6: Add early demux handler for UDP unicast") Acked-by: Hannes Frederic Sowa Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/udp.h | 2 +- net/ipv4/udp.c | 4 +++- net/ipv6/udp.c | 11 ++++++++++- 3 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/udp.h b/include/net/udp.h index 586de4b811b5..626c2d8a70c5 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -260,7 +260,7 @@ static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags, } void udp_v4_early_demux(struct sk_buff *skb); -void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst); +bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst); int udp_get_port(struct sock *sk, unsigned short snum, int (*saddr_cmp)(const struct sock *, const struct sock *)); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cd1d044a7fa5..a6dc48d76a29 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1929,14 +1929,16 @@ drop: /* For TCP sockets, sk_rx_dst is protected by socket lock * For UDP, we use xchg() to guard against concurrent changes. */ -void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) +bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old; if (dst_hold_safe(dst)) { old = xchg(&sk->sk_rx_dst, dst); dst_release(old); + return old != dst; } + return false; } EXPORT_SYMBOL(udp_sk_rx_dst_set); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 20039c8501eb..d6886228e1d0 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -768,6 +768,15 @@ start_lookup: return 0; } +static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) +{ + if (udp_sk_rx_dst_set(sk, dst)) { + const struct rt6_info *rt = (const struct rt6_info *)dst; + + inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt); + } +} + int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto) { @@ -817,7 +826,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int ret; if (unlikely(sk->sk_rx_dst != dst)) - udp_sk_rx_dst_set(sk, dst); + udp6_sk_rx_dst_set(sk, dst); ret = udpv6_queue_rcv_skb(sk, skb); sock_put(sk); -- cgit v1.2.3 From 4e587ea71bf924f7dac621f1351653bd41e446cb Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 25 Aug 2017 15:03:10 -0700 Subject: ipv6: fix sparse warning on rt6i_node Commit c5cff8561d2d adds rcu grace period before freeing fib6_node. This generates a new sparse warning on rt->rt6i_node related code: net/ipv6/route.c:1394:30: error: incompatible types in comparison expression (different address spaces) ./include/net/ip6_fib.h:187:14: error: incompatible types in comparison expression (different address spaces) This commit adds "__rcu" tag for rt6i_node and makes sure corresponding rcu API is used for it. After this fix, sparse no longer generates the above warning. Fixes: c5cff8561d2d ("ipv6: add rcu grace period before freeing fib6_node") Signed-off-by: Wei Wang Acked-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 2 +- net/ipv6/addrconf.c | 2 +- net/ipv6/ip6_fib.c | 11 +++++++---- net/ipv6/route.c | 3 ++- 4 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index e9c59db92942..af509f801084 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -105,7 +105,7 @@ struct rt6_info { * the same cache line. */ struct fib6_table *rt6i_table; - struct fib6_node *rt6i_node; + struct fib6_node __rcu *rt6i_node; struct in6_addr rt6i_gateway; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 3c46e9513a31..936e9ab4dda5 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5556,7 +5556,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) * our DAD process, so we don't need * to do it again */ - if (!(ifp->rt->rt6i_node)) + if (!rcu_access_pointer(ifp->rt->rt6i_node)) ip6_ins_rt(ifp->rt); if (ifp->idev->cnf.forwarding) addrconf_join_anycast(ifp); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index a5ebf86f6be8..10b4b1f8b838 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -889,7 +889,7 @@ add: rt->dst.rt6_next = iter; *ins = rt; - rt->rt6i_node = fn; + rcu_assign_pointer(rt->rt6i_node, fn); atomic_inc(&rt->rt6i_ref); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); @@ -915,7 +915,7 @@ add: return err; *ins = rt; - rt->rt6i_node = fn; + rcu_assign_pointer(rt->rt6i_node, fn); rt->dst.rt6_next = iter->dst.rt6_next; atomic_inc(&rt->rt6i_ref); if (!info->skip_notify) @@ -1480,8 +1480,9 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, int fib6_del(struct rt6_info *rt, struct nl_info *info) { + struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); struct net *net = info->nl_net; - struct fib6_node *fn = rt->rt6i_node; struct rt6_info **rtp; #if RT6_DEBUG >= 2 @@ -1670,7 +1671,9 @@ static int fib6_clean_node(struct fib6_walker *w) if (res) { #if RT6_DEBUG >= 2 pr_debug("%s: del failed: rt=%p@%p err=%d\n", - __func__, rt, rt->rt6i_node, res); + __func__, rt, + rcu_access_pointer(rt->rt6i_node), + res); #endif continue; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 48c8c92dcbd3..86fb2411e2bd 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1383,7 +1383,8 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) { return !(rt->rt6i_flags & RTF_CACHE) && - (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node); + (rt->rt6i_flags & RTF_PCPU || + rcu_access_pointer(rt->rt6i_node)); } static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, -- cgit v1.2.3