diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-11-04 09:41:05 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-11-04 09:41:05 -0800 |
commit | b0f85fa11aefc4f3e03306b4cd47f113bd57dcba (patch) | |
tree | 1333d36d99fde3f97210795941fc246f0ad08a75 /drivers/net/vxlan.c | |
parent | ccc9d4a6d640cbde05d519edeb727881646cf71b (diff) | |
parent | f32bfb9a8ca083f8d148ea90ae5ba66f4831836e (diff) | |
download | linux-b0f85fa11aefc4f3e03306b4cd47f113bd57dcba.tar.bz2 |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
Changes of note:
1) Allow scheduling of ICMP packets in IPVS, from Alex Gartrell.
2) Provide FIB table ID in ipv4 route dumps just as ipv6 does, from
David Ahern.
3) Allow the user to ask for the statistics to be filtered out of
ipv4/ipv6 address netlink dumps. From Sowmini Varadhan.
4) More work to pass the network namespace context around deep into
various packet path APIs, starting with the netfilter hooks. From
Eric W Biederman.
5) Add layer 2 TX/RX checksum offloading to qeth driver, from Thomas
Richter.
6) Use usec resolution for SYN/ACK RTTs in TCP, from Yuchung Cheng.
7) Support Very High Throughput in wireless MESH code, from Bob
Copeland.
8) Allow setting the ageing_time in switchdev/rocker. From Scott
Feldman.
9) Properly autoload L2TP type modules, from Stephen Hemminger.
10) Fix and enable offload features by default in 8139cp driver, from
David Woodhouse.
11) Support both ipv4 and ipv6 sockets in a single vxlan device, from
Jiri Benc.
12) Fix CWND limiting of thin streams in TCP, from Bendik Rønning
Opstad.
13) Fix IPSEC flowcache overflows on large systems, from Steffen
Klassert.
14) Convert bridging to track VLANs using rhashtable entries rather than
a bitmap. From Nikolay Aleksandrov.
15) Make TCP listener handling completely lockless; this is a major
accomplishment. Incoming request sockets now live in the
established hash table just like any other socket.
From Eric Dumazet.
15) Provide more bridging attributes to netlink, from Nikolay
Aleksandrov.
16) Use a hash-based algorithm for ipv4 multipath routing; this was very
long overdue. From Peter Nørlund.
17) Several y2038 cures, mostly avoiding timespec. From Arnd Bergmann.
18) Allow non-root execution of EBPF programs, from Alexei Starovoitov.
19) Support SO_INCOMING_CPU as setsockopt, from Eric Dumazet. This
influences the port binding selection logic used by SO_REUSEPORT.
20) Add ipv6 support to VRF, from David Ahern.
21) Add support for Mellanox Spectrum switch ASIC, from Jiri Pirko.
22) Add rtl8xxxu Realtek wireless driver, from Jes Sorensen.
23) Implement RACK loss recovery in TCP, from Yuchung Cheng.
24) Support multipath routes in MPLS, from Roopa Prabhu.
25) Fix POLLOUT notification for listening sockets in AF_UNIX, from Eric
Dumazet.
26) Add new QED Qlogic driver, from Yuval Mintz, Manish Chopra, and
Sudarsana Kalluru.
27) Don't fetch timestamps on AF_UNIX sockets, from Hannes Frederic
Sowa.
28) Support ipv6 geneve tunnels, from John W Linville.
29) Add flood control support to switchdev layer, from Ido Schimmel.
30) Fix CHECKSUM_PARTIAL handling of potentially fragmented frames, from
Hannes Frederic Sowa.
31) Support persistent maps and progs in bpf, from Daniel Borkmann.
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1790 commits)
sh_eth: use DMA barriers
switchdev: respect SKIP_EOPNOTSUPP flag in case there is no recursion
net: sched: kill dead code in sch_choke.c
irda: Delete an unnecessary check before the function call "irlmp_unregister_service"
net: dsa: mv88e6xxx: include DSA ports in VLANs
net: dsa: mv88e6xxx: disable SA learning for DSA and CPU ports
net/core: fix for_each_netdev_feature
vlan: Invoke driver vlan hooks only if device is present
arcnet/com20020: add LEDS_CLASS dependency
bpf, verifier: annotate verbose printer with __printf
dp83640: Only wait for timestamps for packets with timestamping enabled.
ptp: Change ptp_class to a proper bitmask
dp83640: Prune rx timestamp list before reading from it
dp83640: Delay scheduled work.
dp83640: Include hash in timestamp/packet matching
ipv6: fix tunnel error handling
net/mlx5e: Fix LSO vlan insertion
net/mlx5e: Re-eanble client vlan TX acceleration
net/mlx5e: Return error in case mlx5e_set_features() fails
net/mlx5e: Don't allow more than max supported channels
...
Diffstat (limited to 'drivers/net/vxlan.c')
-rw-r--r-- | drivers/net/vxlan.c | 167 |
1 files changed, 108 insertions, 59 deletions
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index c1587ece28cf..6369a5734d4c 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -75,8 +75,7 @@ static struct rtnl_link_ops vxlan_link_ops; static const u8 all_zeros_mac[ETH_ALEN]; -static struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, - bool no_share, u32 flags); +static int vxlan_sock_add(struct vxlan_dev *vxlan); /* per-network namespace private data for this module */ struct vxlan_net { @@ -994,19 +993,30 @@ static bool vxlan_snoop(struct net_device *dev, static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev) { struct vxlan_dev *vxlan; + unsigned short family = dev->default_dst.remote_ip.sa.sa_family; /* The vxlan_sock is only used by dev, leaving group has * no effect on other vxlan devices. */ - if (atomic_read(&dev->vn_sock->refcnt) == 1) + if (family == AF_INET && dev->vn4_sock && + atomic_read(&dev->vn4_sock->refcnt) == 1) return false; +#if IS_ENABLED(CONFIG_IPV6) + if (family == AF_INET6 && dev->vn6_sock && + atomic_read(&dev->vn6_sock->refcnt) == 1) + return false; +#endif list_for_each_entry(vxlan, &vn->vxlan_list, next) { if (!netif_running(vxlan->dev) || vxlan == dev) continue; - if (vxlan->vn_sock != dev->vn_sock) + if (family == AF_INET && vxlan->vn4_sock != dev->vn4_sock) continue; +#if IS_ENABLED(CONFIG_IPV6) + if (family == AF_INET6 && vxlan->vn6_sock != dev->vn6_sock) + continue; +#endif if (!vxlan_addr_equal(&vxlan->default_dst.remote_ip, &dev->default_dst.remote_ip)) @@ -1022,15 +1032,16 @@ static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev) return false; } -static void vxlan_sock_release(struct vxlan_sock *vs) +static void __vxlan_sock_release(struct vxlan_sock *vs) { - struct sock *sk = vs->sock->sk; - struct net *net = sock_net(sk); - struct vxlan_net *vn = net_generic(net, vxlan_net_id); + struct vxlan_net *vn; + if (!vs) + return; if (!atomic_dec_and_test(&vs->refcnt)) return; + vn = 
net_generic(sock_net(vs->sock->sk), vxlan_net_id); spin_lock(&vn->sock_lock); hlist_del_rcu(&vs->hlist); vxlan_notify_del_rx_port(vs); @@ -1039,32 +1050,43 @@ static void vxlan_sock_release(struct vxlan_sock *vs) queue_work(vxlan_wq, &vs->del_work); } +static void vxlan_sock_release(struct vxlan_dev *vxlan) +{ + __vxlan_sock_release(vxlan->vn4_sock); +#if IS_ENABLED(CONFIG_IPV6) + __vxlan_sock_release(vxlan->vn6_sock); +#endif +} + /* Update multicast group membership when first VNI on * multicast address is brought up */ static int vxlan_igmp_join(struct vxlan_dev *vxlan) { - struct vxlan_sock *vs = vxlan->vn_sock; - struct sock *sk = vs->sock->sk; + struct sock *sk; union vxlan_addr *ip = &vxlan->default_dst.remote_ip; int ifindex = vxlan->default_dst.remote_ifindex; int ret = -EINVAL; - lock_sock(sk); if (ip->sa.sa_family == AF_INET) { struct ip_mreqn mreq = { .imr_multiaddr.s_addr = ip->sin.sin_addr.s_addr, .imr_ifindex = ifindex, }; + sk = vxlan->vn4_sock->sock->sk; + lock_sock(sk); ret = ip_mc_join_group(sk, &mreq); + release_sock(sk); #if IS_ENABLED(CONFIG_IPV6) } else { + sk = vxlan->vn6_sock->sock->sk; + lock_sock(sk); ret = ipv6_stub->ipv6_sock_mc_join(sk, ifindex, &ip->sin6.sin6_addr); + release_sock(sk); #endif } - release_sock(sk); return ret; } @@ -1072,27 +1094,30 @@ static int vxlan_igmp_join(struct vxlan_dev *vxlan) /* Inverse of vxlan_igmp_join when last VNI is brought down */ static int vxlan_igmp_leave(struct vxlan_dev *vxlan) { - struct vxlan_sock *vs = vxlan->vn_sock; - struct sock *sk = vs->sock->sk; + struct sock *sk; union vxlan_addr *ip = &vxlan->default_dst.remote_ip; int ifindex = vxlan->default_dst.remote_ifindex; int ret = -EINVAL; - lock_sock(sk); if (ip->sa.sa_family == AF_INET) { struct ip_mreqn mreq = { .imr_multiaddr.s_addr = ip->sin.sin_addr.s_addr, .imr_ifindex = ifindex, }; + sk = vxlan->vn4_sock->sock->sk; + lock_sock(sk); ret = ip_mc_leave_group(sk, &mreq); + release_sock(sk); #if IS_ENABLED(CONFIG_IPV6) } else { + sk = 
vxlan->vn6_sock->sock->sk; + lock_sock(sk); ret = ipv6_stub->ipv6_sock_mc_drop(sk, ifindex, &ip->sin6.sin6_addr); + release_sock(sk); #endif } - release_sock(sk); return ret; } @@ -1873,8 +1898,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, { struct ip_tunnel_info *info; struct vxlan_dev *vxlan = netdev_priv(dev); - struct sock *sk = vxlan->vn_sock->sock->sk; - unsigned short family = vxlan_get_sk_family(vxlan->vn_sock); + struct sock *sk; struct rtable *rt = NULL; const struct iphdr *old_iph; struct flowi4 fl4; @@ -1901,13 +1925,10 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, dev->name); goto drop; } - if (family != ip_tunnel_info_af(info)) - goto drop; - dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port; vni = be64_to_cpu(info->key.tun_id); - remote_ip.sa.sa_family = family; - if (family == AF_INET) + remote_ip.sa.sa_family = ip_tunnel_info_af(info); + if (remote_ip.sa.sa_family == AF_INET) remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst; else remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst; @@ -1952,6 +1973,10 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, } if (dst->sa.sa_family == AF_INET) { + if (!vxlan->vn4_sock) + goto drop; + sk = vxlan->vn4_sock->sock->sk; + if (info && (info->key.tun_flags & TUNNEL_DONT_FRAGMENT)) df = htons(IP_DF); @@ -2013,6 +2038,10 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, struct flowi6 fl6; u32 rt6i_flags; + if (!vxlan->vn6_sock) + goto drop; + sk = vxlan->vn6_sock->sock->sk; + memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = rdst ? 
rdst->remote_ifindex : 0; fl6.daddr = dst->sin6.sin6_addr; @@ -2204,7 +2233,6 @@ static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan) struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); __u32 vni = vxlan->default_dst.remote_vni; - vxlan->vn_sock = vs; spin_lock(&vn->sock_lock); hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni)); spin_unlock(&vn->sock_lock); @@ -2244,22 +2272,18 @@ static void vxlan_uninit(struct net_device *dev) static int vxlan_open(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); - struct vxlan_sock *vs; - int ret = 0; + int ret; - vs = vxlan_sock_add(vxlan->net, vxlan->cfg.dst_port, - vxlan->cfg.no_share, vxlan->flags); - if (IS_ERR(vs)) - return PTR_ERR(vs); - - vxlan_vs_add_dev(vs, vxlan); + ret = vxlan_sock_add(vxlan); + if (ret < 0) + return ret; if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) { ret = vxlan_igmp_join(vxlan); if (ret == -EADDRINUSE) ret = 0; if (ret) { - vxlan_sock_release(vs); + vxlan_sock_release(vxlan); return ret; } } @@ -2294,7 +2318,6 @@ static int vxlan_stop(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); - struct vxlan_sock *vs = vxlan->vn_sock; int ret = 0; if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) && @@ -2304,7 +2327,7 @@ static int vxlan_stop(struct net_device *dev) del_timer_sync(&vxlan->age_timer); vxlan_flush(vxlan); - vxlan_sock_release(vs); + vxlan_sock_release(vxlan); return ret; } @@ -2581,14 +2604,13 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6, } /* Create new listen socket if needed */ -static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, - u32 flags) +static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6, + __be16 port, u32 flags) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_sock *vs; struct socket *sock; unsigned int h; - bool ipv6 = !!(flags & 
VXLAN_F_IPV6); struct udp_tunnel_sock_cfg tunnel_cfg; vs = kzalloc(sizeof(*vs), GFP_KERNEL); @@ -2633,27 +2655,53 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, return vs; } -static struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, - bool no_share, u32 flags) +static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6) { - struct vxlan_net *vn = net_generic(net, vxlan_net_id); - struct vxlan_sock *vs; - bool ipv6 = flags & VXLAN_F_IPV6; + struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); + struct vxlan_sock *vs = NULL; - if (!no_share) { + if (!vxlan->cfg.no_share) { spin_lock(&vn->sock_lock); - vs = vxlan_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port, - flags); - if (vs) { - if (!atomic_add_unless(&vs->refcnt, 1, 0)) - vs = ERR_PTR(-EBUSY); + vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET, + vxlan->cfg.dst_port, vxlan->flags); + if (vs && !atomic_add_unless(&vs->refcnt, 1, 0)) { spin_unlock(&vn->sock_lock); - return vs; + return -EBUSY; } spin_unlock(&vn->sock_lock); } + if (!vs) + vs = vxlan_socket_create(vxlan->net, ipv6, + vxlan->cfg.dst_port, vxlan->flags); + if (IS_ERR(vs)) + return PTR_ERR(vs); +#if IS_ENABLED(CONFIG_IPV6) + if (ipv6) + vxlan->vn6_sock = vs; + else +#endif + vxlan->vn4_sock = vs; + vxlan_vs_add_dev(vs, vxlan); + return 0; +} - return vxlan_socket_create(net, port, flags); +static int vxlan_sock_add(struct vxlan_dev *vxlan) +{ + bool ipv6 = vxlan->flags & VXLAN_F_IPV6; + bool metadata = vxlan->flags & VXLAN_F_COLLECT_METADATA; + int ret = 0; + + vxlan->vn4_sock = NULL; +#if IS_ENABLED(CONFIG_IPV6) + vxlan->vn6_sock = NULL; + if (ipv6 || metadata) + ret = __vxlan_sock_add(vxlan, true); +#endif + if (!ret && (!ipv6 || metadata)) + ret = __vxlan_sock_add(vxlan, false); + if (ret < 0) + vxlan_sock_release(vxlan); + return ret; } static int vxlan_dev_configure(struct net *src_net, struct net_device *dev, @@ -2662,6 +2710,7 @@ static int vxlan_dev_configure(struct net 
*src_net, struct net_device *dev, struct vxlan_net *vn = net_generic(src_net, vxlan_net_id); struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *dst = &vxlan->default_dst; + unsigned short needed_headroom = ETH_HLEN; int err; bool use_ipv6 = false; __be16 default_port = vxlan->cfg.dst_port; @@ -2681,6 +2730,7 @@ static int vxlan_dev_configure(struct net *src_net, struct net_device *dev, if (!IS_ENABLED(CONFIG_IPV6)) return -EPFNOSUPPORT; use_ipv6 = true; + vxlan->flags |= VXLAN_F_IPV6; } if (conf->remote_ifindex) { @@ -2701,22 +2751,21 @@ static int vxlan_dev_configure(struct net *src_net, struct net_device *dev, pr_info("IPv6 is disabled via sysctl\n"); return -EPERM; } - vxlan->flags |= VXLAN_F_IPV6; } #endif if (!conf->mtu) dev->mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM); - dev->needed_headroom = lowerdev->hard_header_len + - (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM); - } else if (use_ipv6) { - vxlan->flags |= VXLAN_F_IPV6; - dev->needed_headroom = ETH_HLEN + VXLAN6_HEADROOM; - } else { - dev->needed_headroom = ETH_HLEN + VXLAN_HEADROOM; + needed_headroom = lowerdev->hard_header_len; } + if (use_ipv6 || conf->flags & VXLAN_F_COLLECT_METADATA) + needed_headroom += VXLAN6_HEADROOM; + else + needed_headroom += VXLAN_HEADROOM; + dev->needed_headroom = needed_headroom; + memcpy(&vxlan->cfg, conf, sizeof(*conf)); if (!vxlan->cfg.dst_port) vxlan->cfg.dst_port = default_port; |