From fe5c3561e6f0ac7c9546209f01351113c1b77ec8 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Sat, 13 Jul 2013 10:18:18 -0700 Subject: vxlan: add necessary locking on device removal The socket management is now done in workqueue (outside of RTNL) and protected by vn->sock_lock. There were two possible bugs, first the vxlan device was removed from the VNI hash table per socket without holding lock. And there was a race when device is created and the workqueue could run after deletion. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 0ba1e7edbb1b..a5ba8dd7e6be 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1767,9 +1767,15 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, static void vxlan_dellink(struct net_device *dev, struct list_head *head) { + struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); struct vxlan_dev *vxlan = netdev_priv(dev); + flush_workqueue(vxlan_wq); + + spin_lock(&vn->sock_lock); hlist_del_rcu(&vxlan->hlist); + spin_unlock(&vn->sock_lock); + list_del(&vxlan->next); unregister_netdevice_queue(dev, head); } -- cgit v1.2.3 From 372675a4a9ac0a0af962d44dadeea69926ce45e0 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Thu, 18 Jul 2013 08:38:26 -0700 Subject: vxlan: unregister on namespace exit Fix memory leaks and other badness from VXLAN network namespace teardown. When network namespace is removed, all the vxlan devices should be unregistered (not closed). Signed-off-by: Stephen Hemminger Reviewed-by: Pravin B Shelar Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index a5ba8dd7e6be..f101034a297a 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1878,10 +1878,12 @@ static __net_exit void vxlan_exit_net(struct net *net) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_dev *vxlan; + LIST_HEAD(list); rtnl_lock(); list_for_each_entry(vxlan, &vn->vxlan_list, next) - dev_close(vxlan->dev); + unregister_netdevice_queue(vxlan->dev, &list); + unregister_netdevice_many(&list); rtnl_unlock(); } -- cgit v1.2.3 From 3fc2de2faba387218bdf9dbc6b13f513ac3b060a Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Thu, 18 Jul 2013 08:40:15 -0700 Subject: vxlan: fix igmp races There are two race conditions in existing code for doing IGMP management in workqueue in vxlan. First, the vxlan_group_used function checks the list of vxlan's without any protection, and it is possible for open followed by close to occur before the igmp work queue runs. To solve these move the check into vxlan_open/stop so it is protected by RTNL. And split into two work structures so that there is no racy reference to underlying device state. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 53 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 15 deletions(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index f101034a297a..f4c6db419ddb 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -136,7 +136,8 @@ struct vxlan_dev { u32 flags; /* VXLAN_F_* below */ struct work_struct sock_work; - struct work_struct igmp_work; + struct work_struct igmp_join; + struct work_struct igmp_leave; unsigned long age_interval; struct timer_list age_timer; @@ -736,7 +737,6 @@ static bool vxlan_snoop(struct net_device *dev, return false; } - /* See if multicast group is already in use by other ID */ static bool vxlan_group_used(struct vxlan_net *vn, __be32 remote_ip) { @@ -770,12 +770,13 @@ static void vxlan_sock_release(struct vxlan_net *vn, struct vxlan_sock *vs) queue_work(vxlan_wq, &vs->del_work); } -/* Callback to update multicast group membership. - * Scheduled when vxlan goes up/down. +/* Callback to update multicast group membership when first VNI on + * multicast asddress is brought up + * Done as workqueue because ip_mc_join_group acquires RTNL. */ -static void vxlan_igmp_work(struct work_struct *work) +static void vxlan_igmp_join(struct work_struct *work) { - struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_work); + struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_join); struct vxlan_net *vn = net_generic(dev_net(vxlan->dev), vxlan_net_id); struct vxlan_sock *vs = vxlan->vn_sock; struct sock *sk = vs->sock->sk; @@ -785,10 +786,27 @@ static void vxlan_igmp_work(struct work_struct *work) }; lock_sock(sk); - if (vxlan_group_used(vn, vxlan->default_dst.remote_ip)) - ip_mc_join_group(sk, &mreq); - else - ip_mc_leave_group(sk, &mreq); + ip_mc_join_group(sk, &mreq); + release_sock(sk); + + vxlan_sock_release(vn, vs); + dev_put(vxlan->dev); +} + +/* Inverse of vxlan_igmp_join when last VNI is brought down */ +static void vxlan_igmp_leave(struct work_struct *work) +{ + struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_leave); + struct vxlan_net *vn = net_generic(dev_net(vxlan->dev), vxlan_net_id); + struct vxlan_sock *vs = vxlan->vn_sock; + struct sock *sk = vs->sock->sk; + struct ip_mreqn mreq = { + .imr_multiaddr.s_addr = vxlan->default_dst.remote_ip, + .imr_ifindex = vxlan->default_dst.remote_ifindex, + }; + + lock_sock(sk); + ip_mc_leave_group(sk, &mreq); release_sock(sk); vxlan_sock_release(vn, vs); @@ -1359,6 +1377,7 @@ static void vxlan_uninit(struct net_device *dev) /* Start ageing timer and join group when device is brought up */ static int vxlan_open(struct net_device *dev) { + struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_sock *vs = vxlan->vn_sock; @@ -1366,10 +1385,11 @@ static int vxlan_open(struct net_device *dev) if (!vs) return -ENOTCONN; - if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) { + if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip)) && + ! vxlan_group_used(vn, vxlan->default_dst.remote_ip)) { vxlan_sock_hold(vs); dev_hold(dev); - queue_work(vxlan_wq, &vxlan->igmp_work); + queue_work(vxlan_wq, &vxlan->igmp_join); } if (vxlan->age_interval) @@ -1400,13 +1420,15 @@ static void vxlan_flush(struct vxlan_dev *vxlan) /* Cleanup timer and forwarding table on shutdown */ static int vxlan_stop(struct net_device *dev) { + struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_sock *vs = vxlan->vn_sock; - if (vs && IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) { + if (vs && IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip)) && + ! vxlan_group_used(vn, vxlan->default_dst.remote_ip)) { vxlan_sock_hold(vs); dev_hold(dev); - queue_work(vxlan_wq, &vxlan->igmp_work); + queue_work(vxlan_wq, &vxlan->igmp_leave); } del_timer_sync(&vxlan->age_timer); @@ -1471,7 +1493,8 @@ static void vxlan_setup(struct net_device *dev) INIT_LIST_HEAD(&vxlan->next); spin_lock_init(&vxlan->hash_lock); - INIT_WORK(&vxlan->igmp_work, vxlan_igmp_work); + INIT_WORK(&vxlan->igmp_join, vxlan_igmp_join); + INIT_WORK(&vxlan->igmp_leave, vxlan_igmp_leave); INIT_WORK(&vxlan->sock_work, vxlan_sock_work); init_timer_deferrable(&vxlan->age_timer); -- cgit v1.2.3 From 614334df2d6dcc5e15eaa8066a649535d50f329b Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 7 Aug 2013 16:35:45 +0800 Subject: vxlan: fix a regression of igmp join This is a regression introduced by: commit 3fc2de2faba387218bdf9dbc6b13f513ac3b060a Author: stephen hemminger Date: Thu Jul 18 08:40:15 2013 -0700 vxlan: fix igmp races Before this commit, the old code was: if (vxlan_group_used(vn, vxlan->default_dst.remote_ip)) ip_mc_join_group(sk, &mreq); else ip_mc_leave_group(sk, &mreq); therefore we shoud check vxlan_group_used(), not its opposite, for igmp_join. Cc: Stephen Hemminger Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index f4c6db419ddb..11a6c03d202a 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1386,7 +1386,7 @@ static int vxlan_open(struct net_device *dev) return -ENOTCONN; if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip)) && - ! vxlan_group_used(vn, vxlan->default_dst.remote_ip)) { + vxlan_group_used(vn, vxlan->default_dst.remote_ip)) { vxlan_sock_hold(vs); dev_hold(dev); queue_work(vxlan_wq, &vxlan->igmp_join); -- cgit v1.2.3 From ffbe4a539f69fa3e2f3be74627ca2c830f9d923b Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 7 Aug 2013 16:43:22 +0800 Subject: vxlan: fix a soft lockup in vxlan module removal This is a regression introduced by: commit fe5c3561e6f0ac7c9546209f01351113c1b77ec8 Author: stephen hemminger Date: Sat Jul 13 10:18:18 2013 -0700 vxlan: add necessary locking on device removal The problem is that vxlan_dellink(), which is called with RTNL lock held, tries to flush the workqueue synchronously, but apparently igmp_join and igmp_leave work need to hold RTNL lock too, therefore we have a soft lockup! As suggested by Stephen, probably the flush_workqueue can just be removed and let the normal refcounting work. The workqueue has a reference to device and socket, therefore the cleanups should work correctly. Suggested-by: Stephen Hemminger Cc: Stephen Hemminger Cc: David S. Miller Tested-by: Cong Wang Signed-off-by: Cong Wang Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 11a6c03d202a..767f7af3bd40 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1793,8 +1793,6 @@ static void vxlan_dellink(struct net_device *dev, struct list_head *head) struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); struct vxlan_dev *vxlan = netdev_priv(dev); - flush_workqueue(vxlan_wq); - spin_lock(&vn->sock_lock); hlist_del_rcu(&vxlan->hlist); spin_unlock(&vn->sock_lock); -- cgit v1.2.3