From 468dfffcd762cbb2777ec5a76bc21e3748ebf47e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 12 Feb 2016 15:43:58 +0100 Subject: geneve: add dst caching support use generic dst implementation for both plain geneve devices and lwtunnels. In case of UDP traffic with datagram length below MTU this give about 2% performance increase for plain geneve tunnel over ipv4, about 65% performance increase for ipv6 tunnel. Signed-off-by: Paolo Abeni Suggested-and-Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- drivers/net/geneve.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'drivers/net/geneve.c') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 0b14ac3b8d11..6f208132a574 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -72,6 +72,7 @@ struct geneve_dev { bool collect_md; struct gro_cells gro_cells; u32 flags; + struct dst_cache dst_cache; }; /* Geneve device flags */ @@ -297,6 +298,13 @@ static int geneve_init(struct net_device *dev) return err; } + err = dst_cache_init(&geneve->dst_cache, GFP_KERNEL); + if (err) { + free_percpu(dev->tstats); + gro_cells_destroy(&geneve->gro_cells); + return err; + } + return 0; } @@ -304,6 +312,7 @@ static void geneve_uninit(struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); + dst_cache_destroy(&geneve->dst_cache); gro_cells_destroy(&geneve->gro_cells); free_percpu(dev->tstats); } @@ -753,7 +762,9 @@ static struct rtable *geneve_get_v4_rt(struct sk_buff *skb, struct ip_tunnel_info *info) { struct geneve_dev *geneve = netdev_priv(dev); + struct dst_cache *dst_cache; struct rtable *rt = NULL; + bool use_cache = true; __u8 tos; memset(fl4, 0, sizeof(*fl4)); @@ -764,16 +775,26 @@ static struct rtable *geneve_get_v4_rt(struct sk_buff *skb, fl4->daddr = info->key.u.ipv4.dst; fl4->saddr = info->key.u.ipv4.src; fl4->flowi4_tos = RT_TOS(info->key.tos); + dst_cache = &info->dst_cache; } else { tos = geneve->tos; if (tos == 1) { const struct iphdr *iip = ip_hdr(skb); tos = ip_tunnel_get_dsfield(iip, skb); + use_cache = false; } fl4->flowi4_tos = RT_TOS(tos); fl4->daddr = geneve->remote.sin.sin_addr.s_addr; + dst_cache = &geneve->dst_cache; + } + + use_cache = use_cache && !skb->mark; + if (use_cache) { + rt = dst_cache_get_ip4(dst_cache, &fl4->saddr); + if (rt) + return rt; } rt = ip_route_output_key(geneve->net, fl4); @@ -786,6 +807,8 @@ static struct rtable *geneve_get_v4_rt(struct sk_buff *skb, ip_rt_put(rt); return ERR_PTR(-ELOOP); } + if (use_cache) + dst_cache_set_ip4(dst_cache, &rt->dst, fl4->saddr); return rt; } @@ -798,6 +821,8 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb, struct geneve_dev *geneve = netdev_priv(dev); struct geneve_sock *gs6 = geneve->sock6; struct dst_entry *dst = NULL; + struct dst_cache *dst_cache; + bool use_cache = true; __u8 prio; memset(fl6, 0, sizeof(*fl6)); @@ -808,16 +833,26 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb, fl6->daddr = info->key.u.ipv6.dst; fl6->saddr = info->key.u.ipv6.src; fl6->flowi6_tos = RT_TOS(info->key.tos); + dst_cache = &info->dst_cache; } else { prio = geneve->tos; if (prio == 1) { const struct iphdr *iip = ip_hdr(skb); prio = ip_tunnel_get_dsfield(iip, skb); + use_cache = false; } fl6->flowi6_tos = RT_TOS(prio); fl6->daddr = geneve->remote.sin6.sin6_addr; + dst_cache = &geneve->dst_cache; + } + + use_cache = use_cache && !skb->mark; + if (use_cache) { + dst = dst_cache_get_ip6(dst_cache, &fl6->saddr); + if (dst) + return dst; } if (ipv6_stub->ipv6_dst_lookup(geneve->net, gs6->sock->sk, &dst, fl6)) { @@ -830,6 +865,8 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb, return ERR_PTR(-ELOOP); } + if (use_cache) + dst_cache_set_ip6(dst_cache, dst, &fl6->saddr); return dst; } #endif @@ -1272,6 +1309,8 @@ static int geneve_configure(struct net *net, struct net_device *dev, return -EPERM; } + dst_cache_reset(&geneve->dst_cache); + err = register_netdevice(dev); if (err) return err; -- cgit v1.2.3 From 1e9f12ec92ab7307ac7386924e343905f7f12205 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 18 Feb 2016 11:22:49 +0100 Subject: geneve: implement geneve_get_sk_family helper Similarly to the existing vxlan_get_sk_family. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- drivers/net/geneve.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'drivers/net/geneve.c') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 6f208132a574..f09de1e30955 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -110,6 +110,11 @@ static __be64 vni_to_tunnel_id(const __u8 *vni) #endif } +static sa_family_t geneve_get_sk_family(struct geneve_sock *gs) +{ + return gs->sock->sk->sk_family; +} + static struct geneve_dev *geneve_lookup(struct geneve_sock *gs, __be32 addr, u8 vni[]) { @@ -165,16 +170,13 @@ static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb) static u8 zero_vni[3]; u8 *vni; int err = 0; - sa_family_t sa_family; #if IS_ENABLED(CONFIG_IPV6) struct ipv6hdr *ip6h = NULL; struct in6_addr addr6; static struct in6_addr zero_addr6; #endif - sa_family = gs->sock->sk->sk_family; - - if (sa_family == AF_INET) { + if (geneve_get_sk_family(gs) == AF_INET) { iph = ip_hdr(skb); /* outer IP header... */ if (gs->collect_md) { @@ -188,7 +190,7 @@ static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb) geneve = geneve_lookup(gs, addr, vni); #if IS_ENABLED(CONFIG_IPV6) - } else if (sa_family == AF_INET6) { + } else if (geneve_get_sk_family(gs) == AF_INET6) { ip6h = ipv6_hdr(skb); /* outer IPv6 header... */ if (gs->collect_md) { @@ -213,7 +215,7 @@ static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb) (gnvh->oam ? TUNNEL_OAM : 0) | (gnvh->critical ? TUNNEL_CRIT_OPT : 0); - tun_dst = udp_tun_rx_dst(skb, sa_family, flags, + tun_dst = udp_tun_rx_dst(skb, geneve_get_sk_family(gs), flags, vni_to_tunnel_id(gnvh->vni), gnvh->opt_len * 4); if (!tun_dst) @@ -392,7 +394,7 @@ static void geneve_notify_add_rx_port(struct geneve_sock *gs) struct net_device *dev; struct sock *sk = gs->sock->sk; struct net *net = sock_net(sk); - sa_family_t sa_family = sk->sk_family; + sa_family_t sa_family = geneve_get_sk_family(gs); __be16 port = inet_sk(sk)->inet_sport; int err; @@ -553,7 +555,7 @@ static void geneve_notify_del_rx_port(struct geneve_sock *gs) struct net_device *dev; struct sock *sk = gs->sock->sk; struct net *net = sock_net(sk); - sa_family_t sa_family = sk->sk_family; + sa_family_t sa_family = geneve_get_sk_family(gs); __be16 port = inet_sk(sk)->inet_sport; rcu_read_lock(); @@ -596,7 +598,7 @@ static struct geneve_sock *geneve_find_sock(struct geneve_net *gn, list_for_each_entry(gs, &gn->sock_list, list) { if (inet_sk(gs->sock->sk)->inet_sport == dst_port && - inet_sk(gs->sock->sk)->sk.sk_family == family) { + geneve_get_sk_family(gs) == family) { return gs; } } -- cgit v1.2.3 From 9fc4754582bf46b0998a64fce74f570cea720e18 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 18 Feb 2016 11:22:50 +0100 Subject: geneve: move geneve device lookup before iptunnel_pull_header This is in preparation for iptunnel_pull_header calling skb_scrub_packet. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- drivers/net/geneve.c | 76 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 44 insertions(+), 32 deletions(-) (limited to 'drivers/net/geneve.c') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index f09de1e30955..4ceccf871b3f 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -158,55 +158,60 @@ static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) return (struct genevehdr *)(udp_hdr(skb) + 1); } -/* geneve receive/decap routine */ -static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb) +static struct geneve_dev *geneve_lookup_skb(struct geneve_sock *gs, + struct sk_buff *skb) { - struct genevehdr *gnvh = geneve_hdr(skb); - struct metadata_dst *tun_dst = NULL; - struct geneve_dev *geneve = NULL; - struct pcpu_sw_netstats *stats; - struct iphdr *iph = NULL; + u8 *vni; __be32 addr; static u8 zero_vni[3]; - u8 *vni; - int err = 0; #if IS_ENABLED(CONFIG_IPV6) - struct ipv6hdr *ip6h = NULL; - struct in6_addr addr6; static struct in6_addr zero_addr6; #endif if (geneve_get_sk_family(gs) == AF_INET) { + struct iphdr *iph; + iph = ip_hdr(skb); /* outer IP header... */ if (gs->collect_md) { vni = zero_vni; addr = 0; } else { - vni = gnvh->vni; - + vni = geneve_hdr(skb)->vni; addr = iph->saddr; } - geneve = geneve_lookup(gs, addr, vni); + return geneve_lookup(gs, addr, vni); #if IS_ENABLED(CONFIG_IPV6) } else if (geneve_get_sk_family(gs) == AF_INET6) { + struct ipv6hdr *ip6h; + struct in6_addr addr6; + ip6h = ipv6_hdr(skb); /* outer IPv6 header... */ if (gs->collect_md) { vni = zero_vni; addr6 = zero_addr6; } else { - vni = gnvh->vni; - + vni = geneve_hdr(skb)->vni; addr6 = ip6h->saddr; } - geneve = geneve6_lookup(gs, addr6, vni); + return geneve6_lookup(gs, addr6, vni); #endif } - if (!geneve) - goto drop; + return NULL; +} + +/* geneve receive/decap routine */ +static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, + struct sk_buff *skb) +{ + struct genevehdr *gnvh = geneve_hdr(skb); + struct metadata_dst *tun_dst = NULL; + struct pcpu_sw_netstats *stats; + int err = 0; + void *oiph; if (ip_tunnel_collect_metadata() || gs->collect_md) { __be16 flags; @@ -243,25 +248,27 @@ static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb) if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) goto drop; + oiph = skb_network_header(skb); skb_reset_network_header(skb); - if (iph) - err = IP_ECN_decapsulate(iph, skb); + if (geneve_get_sk_family(gs) == AF_INET) + err = IP_ECN_decapsulate(oiph, skb); #if IS_ENABLED(CONFIG_IPV6) - if (ip6h) - err = IP6_ECN_decapsulate(ip6h, skb); + else + err = IP6_ECN_decapsulate(oiph, skb); #endif if (unlikely(err)) { if (log_ecn_error) { - if (iph) + if (geneve_get_sk_family(gs) == AF_INET) net_info_ratelimited("non-ECT from %pI4 " "with TOS=%#x\n", - &iph->saddr, iph->tos); + &((struct iphdr *)oiph)->saddr, + ((struct iphdr *)oiph)->tos); #if IS_ENABLED(CONFIG_IPV6) - if (ip6h) + else net_info_ratelimited("non-ECT from %pI6\n", - &ip6h->saddr); + &((struct ipv6hdr *)oiph)->saddr); #endif } if (err > 1) { @@ -323,6 +330,7 @@ static void geneve_uninit(struct net_device *dev) static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct genevehdr *geneveh; + struct geneve_dev *geneve; struct geneve_sock *gs; int opts_len; @@ -338,16 +346,20 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) if (unlikely(geneveh->proto_type != htons(ETH_P_TEB))) goto error; + gs = rcu_dereference_sk_user_data(sk); + if (!gs) + goto drop; + + geneve = geneve_lookup_skb(gs, skb); + if (!geneve) + goto drop; + opts_len = geneveh->opt_len * 4; if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, htons(ETH_P_TEB))) goto drop; - gs = rcu_dereference_sk_user_data(sk); - if (!gs) - goto drop; - - geneve_rx(gs, skb); + geneve_rx(geneve, gs, skb); return 0; drop: -- cgit v1.2.3 From 7f290c94352e59b1d720055fce760a69a63bd0a1 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 18 Feb 2016 11:22:52 +0100 Subject: iptunnel: scrub packet in iptunnel_pull_header Part of skb_scrub_packet was open coded in iptunnel_pull_header. Let it call skb_scrub_packet directly instead. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- drivers/net/geneve.c | 4 ++-- drivers/net/vxlan.c | 4 ++-- include/net/ip_tunnels.h | 3 ++- net/ipv4/ip_gre.c | 2 +- net/ipv4/ip_tunnel_core.c | 8 +++----- net/ipv4/ipip.c | 2 +- net/ipv6/sit.c | 2 +- 7 files changed, 12 insertions(+), 13 deletions(-) (limited to 'drivers/net/geneve.c') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 4ceccf871b3f..dfbe3ca687f7 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -237,7 +237,6 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, } skb_reset_mac_header(skb); - skb_scrub_packet(skb, !net_eq(geneve->net, dev_net(geneve->dev))); skb->protocol = eth_type_trans(skb, geneve->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); @@ -356,7 +355,8 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) opts_len = geneveh->opt_len * 4; if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, - htons(ETH_P_TEB))) + htons(ETH_P_TEB), + !net_eq(geneve->net, dev_net(geneve->dev)))) goto drop; geneve_rx(geneve, gs, skb); diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 16a176cd0dad..c963897e713d 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1198,7 +1198,6 @@ static void vxlan_rcv(struct vxlan_dev *vxlan, struct vxlan_sock *vs, int err = 0; skb_reset_mac_header(skb); - skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev))); skb->protocol = eth_type_trans(skb, vxlan->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); @@ -1305,7 +1304,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) if (!vxlan) goto drop; - if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB))) + if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB), + !net_eq(vxlan->net, dev_net(vxlan->dev)))) goto drop; if (vxlan_collect_metadata(vs)) { diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 87408ab80856..4dd616376fec 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -270,7 +270,8 @@ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, return INET_ECN_encapsulate(tos, inner); } -int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto); +int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, + bool xnet); void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, u8 proto, u8 tos, u8 ttl, __be16 df, bool xnet); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 917c2c1bfadd..12071e28d958 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -238,7 +238,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, return -EINVAL; } } - return iptunnel_pull_header(skb, hdr_len, tpi->proto); + return iptunnel_pull_header(skb, hdr_len, tpi->proto, false); } static void ipgre_err(struct sk_buff *skb, u32 info, diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index a6e58b6141cd..eaca2449a09a 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -86,7 +86,8 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, } EXPORT_SYMBOL_GPL(iptunnel_xmit); -int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) +int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, + bool xnet) { if (unlikely(!pskb_may_pull(skb, hdr_len))) return -ENOMEM; @@ -109,13 +110,10 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) skb->protocol = inner_proto; } - nf_reset(skb); - secpath_reset(skb); skb_clear_hash_if_not_l4(skb); - skb_dst_drop(skb); skb->vlan_tci = 0; skb_set_queue_mapping(skb, 0); - skb->pkt_type = PACKET_HOST; + skb_scrub_packet(skb, xnet); return 0; } EXPORT_SYMBOL_GPL(iptunnel_pull_header); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 6ec5b42fd172..ec51d02166de 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -195,7 +195,7 @@ static int ipip_rcv(struct sk_buff *skb) if (tunnel) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; - if (iptunnel_pull_header(skb, 0, tpi.proto)) + if (iptunnel_pull_header(skb, 0, tpi.proto, false)) goto drop; return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error); } diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 0625ac6356b5..f45b8ffc2840 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -740,7 +740,7 @@ static int ipip_rcv(struct sk_buff *skb) if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; - if (iptunnel_pull_header(skb, 0, tpi.proto)) + if (iptunnel_pull_header(skb, 0, tpi.proto, false)) goto drop; return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error); } -- cgit v1.2.3 From 14f1f724355206dea1cf3f23ee87993bfd47c70c Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 19 Feb 2016 11:26:24 -0800 Subject: GENEVE: Support outer IPv4 Tx checksums by default This change makes it so that if UDP CSUM is not specified we will default to enabling it. The main motivation behind this is the fact that with the use of outer checksum we can greatly improve the performance for GENEVE tunnels on hardware that doesn't know how to parse them. Signed-off-by: Alexander Duyck Acked-by: Tom Herbert Signed-off-by: David S. Miller --- drivers/net/geneve.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'drivers/net/geneve.c') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index dfbe3ca687f7..8fa8388cc5d4 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -76,7 +76,7 @@ struct geneve_dev { }; /* Geneve device flags */ -#define GENEVE_F_UDP_CSUM BIT(0) +#define GENEVE_F_UDP_ZERO_CSUM_TX BIT(0) #define GENEVE_F_UDP_ZERO_CSUM6_TX BIT(1) #define GENEVE_F_UDP_ZERO_CSUM6_RX BIT(2) @@ -703,7 +703,7 @@ static int geneve_build_skb(struct rtable *rt, struct sk_buff *skb, struct genevehdr *gnvh; int min_headroom; int err; - bool udp_sum = !!(flags & GENEVE_F_UDP_CSUM); + bool udp_sum = !(flags & GENEVE_F_UDP_ZERO_CSUM_TX); skb_scrub_packet(skb, xnet); @@ -944,9 +944,9 @@ static netdev_tx_t geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, opts = ip_tunnel_info_opts(info); if (key->tun_flags & TUNNEL_CSUM) - flags |= GENEVE_F_UDP_CSUM; + flags &= ~GENEVE_F_UDP_ZERO_CSUM_TX; else - flags &= ~GENEVE_F_UDP_CSUM; + flags |= GENEVE_F_UDP_ZERO_CSUM_TX; err = geneve_build_skb(rt, skb, key->tun_flags, vni, info->options_len, opts, flags, xnet); @@ -972,7 +972,7 @@ static netdev_tx_t geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, udp_tunnel_xmit_skb(rt, gs4->sock->sk, skb, fl4.saddr, fl4.daddr, tos, ttl, df, sport, geneve->dst_port, !net_eq(geneve->net, dev_net(geneve->dev)), - !(flags & GENEVE_F_UDP_CSUM)); + !!(flags & GENEVE_F_UDP_ZERO_CSUM_TX)); return NETDEV_TX_OK; @@ -1383,8 +1383,8 @@ static int geneve_newlink(struct net *net, struct net_device *dev, metadata = true; if (data[IFLA_GENEVE_UDP_CSUM] && - nla_get_u8(data[IFLA_GENEVE_UDP_CSUM])) - flags |= GENEVE_F_UDP_CSUM; + !nla_get_u8(data[IFLA_GENEVE_UDP_CSUM])) + flags |= GENEVE_F_UDP_ZERO_CSUM_TX; if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX] && nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX])) @@ -1454,7 +1454,7 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) } if (nla_put_u8(skb, IFLA_GENEVE_UDP_CSUM, - !!(geneve->flags & GENEVE_F_UDP_CSUM)) || + !(geneve->flags & GENEVE_F_UDP_ZERO_CSUM_TX)) || nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_TX, !!(geneve->flags & GENEVE_F_UDP_ZERO_CSUM6_TX)) || nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_RX, -- cgit v1.2.3 From 14ca0751c96f8d3d0f52e8ed3b3236f8b34d3460 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Mar 2016 15:15:06 +0100 Subject: bpf: support for access to tunnel options After eBPF being able to programmatically access/manage tunnel key meta data via commit d3aa45ce6b94 ("bpf: add helpers to access tunnel metadata") and more recently also for IPv6 through c6c33454072f ("bpf: support ipv6 for bpf_skb_{set,get}_tunnel_key"), this work adds two complementary helpers to generically access their auxiliary tunnel options. Geneve and vxlan support this facility. For geneve, TLVs can be pushed, and for the vxlan case its GBP extension. I.e. setting tunnel key for geneve case only makes sense, if we can also read/write TLVs into it. In the GBP case, it provides the flexibility to easily map the group policy ID in combination with other helpers or maps. I chose to model this as two separate helpers, bpf_skb_{set,get}_tunnel_opt(), for a couple of reasons. bpf_skb_{set,get}_tunnel_key() is already rather complex by itself, and there may be cases for tunnel key backends where tunnel options are not always needed. If we would have integrated this into bpf_skb_{set,get}_tunnel_key() nevertheless, we are very limited with remaining helper arguments, so keeping compatibility on structs in case of passing in a flat buffer gets more cumbersome. Separating both also allows for more flexibility and future extensibility, f.e. options could be fed directly from a map, etc. Moreover, change geneve's xmit path to test only for info->options_len instead of TUNNEL_GENEVE_OPT flag. This makes it more consistent with vxlan's xmit path and allows for avoiding to specify a protocol flag in the API on xmit, so it can be protocol agnostic. Having info->options_len is enough information that is needed. Tested with vxlan and geneve. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- drivers/net/geneve.c | 4 +-- include/uapi/linux/bpf.h | 11 +++++++ net/core/filter.c | 83 ++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 90 insertions(+), 8 deletions(-) (limited to 'drivers/net/geneve.c') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index bc5da357e16d..36db4cf0579c 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -940,7 +940,7 @@ static netdev_tx_t geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, u8 vni[3]; tunnel_id_to_vni(key->tun_id, vni); - if (key->tun_flags & TUNNEL_GENEVE_OPT) + if (info->options_len) opts = ip_tunnel_info_opts(info); if (key->tun_flags & TUNNEL_CSUM) @@ -1027,7 +1027,7 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, u8 vni[3]; tunnel_id_to_vni(key->tun_id, vni); - if (key->tun_flags & TUNNEL_GENEVE_OPT) + if (info->options_len) opts = ip_tunnel_info_opts(info); if (key->tun_flags & TUNNEL_CSUM) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 21ee6d52016f..9221f653fee3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -298,6 +298,17 @@ enum bpf_func_id { * Return: csum result */ BPF_FUNC_csum_diff, + + /** + * bpf_skb_[gs]et_tunnel_opt(skb, opt, size) + * retrieve or populate tunnel options metadata + * @skb: pointer to skb + * @opt: pointer to raw tunnel option data + * @size: size of @opt + * Return: 0 on success for set, option size for get + */ + BPF_FUNC_skb_get_tunnel_opt, + BPF_FUNC_skb_set_tunnel_opt, __BPF_FUNC_MAX_ID, }; diff --git a/net/core/filter.c b/net/core/filter.c index 6c9d15561d04..012a10c2da94 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1809,6 +1809,32 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .arg4_type = ARG_ANYTHING, }; +static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + u8 *to = (u8 *) (long) r2; + const struct ip_tunnel_info *info = skb_tunnel_info(skb); + + if (unlikely(!info || + !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) + return -ENOENT; + if (unlikely(size < info->options_len)) + return -ENOMEM; + + ip_tunnel_info_opts_get(to, info); + + return info->options_len; +} + +static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { + .func = bpf_skb_get_tunnel_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, +}; + static struct metadata_dst __percpu *md_dst; static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) @@ -1875,17 +1901,58 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .arg4_type = ARG_ANYTHING, }; -static const struct bpf_func_proto *bpf_get_skb_set_tunnel_key_proto(void) +#define BPF_TUNLEN_MAX 255 + +static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + u8 *from = (u8 *) (long) r2; + struct ip_tunnel_info *info = skb_tunnel_info(skb); + const struct metadata_dst *md = this_cpu_ptr(md_dst); + + if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1)))) + return -EINVAL; + if (unlikely(size > BPF_TUNLEN_MAX)) + return -ENOMEM; + + ip_tunnel_info_opts_set(info, from, size); + + return 0; +} + +static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { + .func = bpf_skb_set_tunnel_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, +}; + +static const struct bpf_func_proto * +bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) { if (!md_dst) { - /* race is not possible, since it's called from - * verifier that is holding verifier mutex + BUILD_BUG_ON(FIELD_SIZEOF(struct ip_tunnel_info, + options_len) != 1); + + /* Race is not possible, since it's called from verifier + * that is holding verifier mutex. */ - md_dst = metadata_dst_alloc_percpu(0, GFP_KERNEL); + md_dst = metadata_dst_alloc_percpu(BPF_TUNLEN_MAX, + GFP_KERNEL); if (!md_dst) return NULL; } - return &bpf_skb_set_tunnel_key_proto; + + switch (which) { + case BPF_FUNC_skb_set_tunnel_key: + return &bpf_skb_set_tunnel_key_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return &bpf_skb_set_tunnel_opt_proto; + default: + return NULL; + } } static const struct bpf_func_proto * @@ -1939,7 +2006,11 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: - return bpf_get_skb_set_tunnel_key_proto(); + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_skb_get_tunnel_opt: + return &bpf_skb_get_tunnel_opt_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_redirect: return &bpf_redirect_proto; case BPF_FUNC_get_route_realm: -- cgit v1.2.3 From db3c6139e6ead91b42e7c2ad044ed8beaee884e6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Mar 2016 15:15:07 +0100 Subject: bpf, vxlan, geneve, gre: fix usage of dst_cache on xmit The assumptions from commit 0c1d70af924b ("net: use dst_cache for vxlan device"), 468dfffcd762 ("geneve: add dst caching support") and 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels") on dst_cache usage when ip_tunnel_info is used is unfortunately not always valid as assumed. While it seems correct for ip_tunnel_info front-ends such as OVS, eBPF however can fill in ip_tunnel_info for consumers like vxlan, geneve or gre with different remote dsts, tos, etc, therefore they cannot be assumed as packet independent. Right now vxlan, geneve, gre would cache the dst for eBPF and every packet would reuse the same entry that was first created on the initial route lookup. eBPF doesn't store/cache the ip_tunnel_info, so each skb may have a different one. Fix it by adding a flag that checks the ip_tunnel_info. Also the !tos test in vxlan needs to be handeled differently in this context as it is currently inferred from ip_tunnel_info as well if present. ip_tunnel_dst_cache_usable() helper is added for the three tunnel cases, which checks if we can use dst cache. Fixes: 0c1d70af924b ("net: use dst_cache for vxlan device") Fixes: 468dfffcd762 ("geneve: add dst caching support") Fixes: 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels") Signed-off-by: Daniel Borkmann Acked-by: Paolo Abeni Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- drivers/net/geneve.c | 6 ++---- drivers/net/vxlan.c | 24 ++++++++++++------------ include/net/ip_tunnels.h | 15 +++++++++++++++ net/core/filter.c | 2 +- net/ipv4/ip_gre.c | 10 ++++++---- 5 files changed, 36 insertions(+), 21 deletions(-) (limited to 'drivers/net/geneve.c') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 36db4cf0579c..6a0cbbe03e5d 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -775,10 +775,10 @@ static struct rtable *geneve_get_v4_rt(struct sk_buff *skb, struct flowi4 *fl4, struct ip_tunnel_info *info) { + bool use_cache = ip_tunnel_dst_cache_usable(skb, info); struct geneve_dev *geneve = netdev_priv(dev); struct dst_cache *dst_cache; struct rtable *rt = NULL; - bool use_cache = true; __u8 tos; memset(fl4, 0, sizeof(*fl4)); @@ -804,7 +804,6 @@ static struct rtable *geneve_get_v4_rt(struct sk_buff *skb, dst_cache = &geneve->dst_cache; } - use_cache = use_cache && !skb->mark; if (use_cache) { rt = dst_cache_get_ip4(dst_cache, &fl4->saddr); if (rt) @@ -832,11 +831,11 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb, struct flowi6 *fl6, struct ip_tunnel_info *info) { + bool use_cache = ip_tunnel_dst_cache_usable(skb, info); struct geneve_dev *geneve = netdev_priv(dev); struct geneve_sock *gs6 = geneve->sock6; struct dst_entry *dst = NULL; struct dst_cache *dst_cache; - bool use_cache = true; __u8 prio; memset(fl6, 0, sizeof(*fl6)); @@ -862,7 +861,6 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb, dst_cache = &geneve->dst_cache; } - use_cache = use_cache && !skb->mark; if (use_cache) { dst = dst_cache_get_ip6(dst_cache, &fl6->saddr); if (dst) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index fc998a3bd234..7294a459b13c 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1756,17 +1756,15 @@ static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan, struct sk_buff *skb, int oif, u8 tos, __be32 daddr, __be32 *saddr, struct dst_cache *dst_cache, - struct ip_tunnel_info *info) + const struct ip_tunnel_info *info) { + bool use_cache = ip_tunnel_dst_cache_usable(skb, info); struct rtable *rt = NULL; - bool use_cache = false; struct flowi4 fl4; - /* when the ip_tunnel_info is availble, the tos used for lookup is - * packet independent, so we can use the cache - */ - if (!skb->mark && (!tos || info)) { - use_cache = true; + if (tos && !info) + use_cache = false; + if (use_cache) { rt = dst_cache_get_ip4(dst_cache, saddr); if (rt) return rt; @@ -1794,13 +1792,15 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan, struct sk_buff *skb, int oif, const struct in6_addr *daddr, struct in6_addr *saddr, - struct dst_cache *dst_cache) + struct dst_cache *dst_cache, + const struct ip_tunnel_info *info) { + bool use_cache = ip_tunnel_dst_cache_usable(skb, info); struct dst_entry *ndst; struct flowi6 fl6; int err; - if (!skb->mark) { + if (use_cache) { ndst = dst_cache_get_ip6(dst_cache, saddr); if (ndst) return ndst; @@ -1820,7 +1820,7 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan, return ERR_PTR(err); *saddr = fl6.saddr; - if (!skb->mark) + if (use_cache) dst_cache_set_ip6(dst_cache, ndst, saddr); return ndst; } @@ -2018,7 +2018,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, ndst = vxlan6_get_route(vxlan, skb, rdst ? rdst->remote_ifindex : 0, &dst->sin6.sin6_addr, &saddr, - dst_cache); + dst_cache, info); if (IS_ERR(ndst)) { netdev_dbg(dev, "no route to %pI6\n", &dst->sin6.sin6_addr); @@ -2387,7 +2387,7 @@ static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) return -EINVAL; ndst = vxlan6_get_route(vxlan, skb, 0, &info->key.u.ipv6.dst, - &info->key.u.ipv6.src, NULL); + &info->key.u.ipv6.src, NULL, info); if (IS_ERR(ndst)) return PTR_ERR(ndst); dst_release(ndst); diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 5f28b606633e..e1395d70fb48 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -140,6 +140,7 @@ struct ip_tunnel { #define TUNNEL_CRIT_OPT __cpu_to_be16(0x0400) #define TUNNEL_GENEVE_OPT __cpu_to_be16(0x0800) #define TUNNEL_VXLAN_OPT __cpu_to_be16(0x1000) +#define TUNNEL_NOCACHE __cpu_to_be16(0x2000) #define TUNNEL_OPTIONS_PRESENT (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT) @@ -206,6 +207,20 @@ static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, 0, sizeof(*key) - IP_TUNNEL_KEY_SIZE); } +static inline bool +ip_tunnel_dst_cache_usable(const struct sk_buff *skb, + const struct ip_tunnel_info *info) +{ + if (skb->mark) + return false; + if (!info) + return true; + if (info->key.tun_flags & TUNNEL_NOCACHE) + return false; + + return true; +} + static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info *tun_info) { diff --git a/net/core/filter.c b/net/core/filter.c index 012a10c2da94..a66dc03c261f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1870,7 +1870,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) info = &md->u.tun_info; info->mode = IP_TUNNEL_INFO_TX; - info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM; + info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; if (flags & BPF_F_DONT_FRAGMENT) info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 202437d6087b..31936d387cfd 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -527,11 +527,12 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; + struct rtable *rt = NULL; struct flowi4 fl; - struct rtable *rt; int min_headroom; int tunnel_hlen; __be16 df, flags; + bool use_cache; int err; tun_info = skb_tunnel_info(skb); @@ -540,13 +541,14 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) goto err_free_skb; key = &tun_info->key; - rt = !skb->mark ? dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr) : - NULL; + use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); + if (use_cache) + rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr); if (!rt) { rt = gre_get_rt(skb, dev, &fl, key); if (IS_ERR(rt)) goto err_free_skb; - if (!skb->mark) + if (use_cache) dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, fl.saddr); } -- cgit v1.2.3 From 134611446dc657e1bbc73ca0e4e6b599df687db0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 9 Mar 2016 03:00:02 +0100 Subject: ip_tunnel: add support for setting flow label via collect metadata This patch extends udp_tunnel6_xmit_skb() to pass in the IPv6 flow label from call sites. Currently, there's no such option and it's always set to zero when writing ip6_flow_hdr(). Add a label member to ip_tunnel_key, so that flow-based tunnels via collect metadata frontends can make use of it. vxlan and geneve will be converted to add flow label support separately. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- drivers/net/geneve.c | 2 +- drivers/net/vxlan.c | 2 +- include/net/dst_metadata.h | 5 ++++- include/net/ip_tunnels.h | 4 +++- include/net/udp_tunnel.h | 4 ++-- net/ipv6/ip6_udp_tunnel.c | 6 +++--- net/tipc/udp_media.c | 2 +- 7 files changed, 15 insertions(+), 10 deletions(-) (limited to 'drivers/net/geneve.c') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 6a0cbbe03e5d..89ccff79d76c 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1054,7 +1054,7 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, ttl = ttl ? : ip6_dst_hoplimit(dst); } udp_tunnel6_xmit_skb(dst, gs6->sock->sk, skb, dev, - &fl6.saddr, &fl6.daddr, prio, ttl, + &fl6.saddr, &fl6.daddr, prio, ttl, 0, sport, geneve->dst_port, !!(flags & GENEVE_F_UDP_ZERO_CSUM6_TX)); return NETDEV_TX_OK; diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 2399099e68cf..8bdcd5ea8424 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2066,7 +2066,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, return; } udp_tunnel6_xmit_skb(ndst, sk, skb, dev, - &saddr, &dst->sin6.sin6_addr, tos, ttl, + &saddr, &dst->sin6.sin6_addr, tos, ttl, 0, src_port, dst_port, !udp_sum); #endif } diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 84b833af6882..5db9f5910428 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -126,7 +126,7 @@ static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, ip_tunnel_key_init(&tun_dst->u.tun_info.key, iph->saddr, iph->daddr, iph->tos, iph->ttl, - 0, 0, tunnel_id, flags); + 0, 0, 0, tunnel_id, flags); return tun_dst; } @@ -152,8 +152,11 @@ static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb, info->key.u.ipv6.src = ip6h->saddr; info->key.u.ipv6.dst = ip6h->daddr; + info->key.tos = ipv6_get_dsfield(ip6h); info->key.ttl = ip6h->hop_limit; + info->key.label = ip6_flowlabel(ip6h); + return tun_dst; } diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 0acd80fadb32..5dc2e454f866 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -48,6 +48,7 @@ struct ip_tunnel_key { __be16 tun_flags; u8 tos; /* TOS for IPv4, TC for IPv6 */ u8 ttl; /* TTL for IPv4, HL for IPv6 */ + __be32 label; /* Flow Label for IPv6 */ __be16 tp_src; __be16 tp_dst; }; @@ -181,7 +182,7 @@ int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op, static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, __be32 saddr, __be32 daddr, - u8 tos, u8 ttl, + u8 tos, u8 ttl, __be32 label, __be16 tp_src, __be16 tp_dst, __be64 tun_id, __be16 tun_flags) { @@ -192,6 +193,7 @@ static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, 0, IP_TUNNEL_KEY_IPV4_PAD_LEN); key->tos = tos; key->ttl = ttl; + key->label = label; key->tun_flags = tun_flags; /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 97f5adb121a6..b83114077cee 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -88,8 +88,8 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, struct net_device *dev, struct in6_addr *saddr, struct in6_addr *daddr, - __u8 prio, __u8 ttl, __be16 src_port, - __be16 dst_port, bool nocheck); + __u8 prio, __u8 ttl, __be32 label, + __be16 src_port, __be16 dst_port, bool nocheck); #endif void udp_tunnel_sock_release(struct socket *sock); diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 14dacf1df529..a7520528ecd2 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -73,8 +73,8 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, struct net_device *dev, struct in6_addr *saddr, struct in6_addr *daddr, - __u8 prio, __u8 ttl, __be16 src_port, - __be16 dst_port, bool nocheck) + __u8 prio, __u8 ttl, __be32 label, + __be16 src_port, __be16 dst_port, bool nocheck) { struct udphdr *uh; struct ipv6hdr *ip6h; @@ -98,7 +98,7 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, __skb_push(skb, sizeof(*ip6h)); skb_reset_network_header(skb); ip6h = ipv6_hdr(skb); - ip6_flow_hdr(ip6h, prio, htonl(0)); + ip6_flow_hdr(ip6h, prio, label); ip6h->payload_len = htons(skb->len); ip6h->nexthdr = IPPROTO_UDP; ip6h->hop_limit = ttl; diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 49b3c2ede7ab..c94f9a15e2cd 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -196,7 +196,7 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, ttl = ip6_dst_hoplimit(ndst); err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, ndst->dev, &src->ipv6, - &dst->ipv6, 0, ttl, src->udp_port, + &dst->ipv6, 0, ttl, 0, src->udp_port, dst->udp_port, false); #endif } -- cgit v1.2.3 From 8eb3b99554b82da968d1fbc00df9f3156c5e2d63 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 9 Mar 2016 03:00:04 +0100 Subject: geneve: support setting IPv6 flow label This work adds support for setting the IPv6 flow label for geneve per device and through collect metadata (ip_tunnel_key) frontends. Also here, the geneve dst cache does not need any special considerations, for the cases where caches can be used, the label is static per cache. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- drivers/net/geneve.c | 35 +++++++++++++++++++++++++++-------- include/uapi/linux/if_link.h | 1 + 2 files changed, 28 insertions(+), 8 deletions(-) (limited to 'drivers/net/geneve.c') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 89ccff79d76c..33185b9a435e 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -68,6 +68,7 @@ struct geneve_dev { u8 tos; /* TOS override */ union geneve_addr remote; /* IP address for link partner */ struct list_head next; /* geneve's per namespace list */ + __be32 label; /* IPv6 flowlabel override */ __be16 dst_port; bool collect_md; struct gro_cells gro_cells; @@ -846,6 +847,7 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb, fl6->daddr = info->key.u.ipv6.dst; fl6->saddr = info->key.u.ipv6.src; fl6->flowi6_tos = RT_TOS(info->key.tos); + fl6->flowlabel = info->key.label; dst_cache = &info->dst_cache; } else { prio = geneve->tos; @@ -857,6 +859,7 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb, } fl6->flowi6_tos = RT_TOS(prio); + fl6->flowlabel = geneve->label; fl6->daddr = geneve->remote.sin6.sin6_addr; dst_cache = &geneve->dst_cache; } @@ -998,6 +1001,7 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, struct flowi6 fl6; __u8 prio, ttl; __be16 sport; + __be32 label; bool xnet = !net_eq(geneve->net, dev_net(geneve->dev)); u32 flags = geneve->flags; @@ -1041,6 +1045,7 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, prio = ip_tunnel_ecn_encap(key->tos, iip, skb); ttl = key->ttl; + label = info->key.label; } else { err = geneve6_build_skb(dst, skb, 0, geneve->vni, 0, NULL, flags, xnet); @@ -1052,9 +1057,11 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, if (!ttl && ipv6_addr_is_multicast(&fl6.daddr)) ttl = 1; ttl = ttl ? : ip6_dst_hoplimit(dst); + label = geneve->label; } + udp_tunnel6_xmit_skb(dst, gs6->sock->sk, skb, dev, - &fl6.saddr, &fl6.daddr, prio, ttl, 0, + &fl6.saddr, &fl6.daddr, prio, ttl, label, sport, geneve->dst_port, !!(flags & GENEVE_F_UDP_ZERO_CSUM6_TX)); return NETDEV_TX_OK; @@ -1238,6 +1245,7 @@ static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = { [IFLA_GENEVE_REMOTE6] = { .len = sizeof(struct in6_addr) }, [IFLA_GENEVE_TTL] = { .type = NLA_U8 }, [IFLA_GENEVE_TOS] = { .type = NLA_U8 }, + [IFLA_GENEVE_LABEL] = { .type = NLA_U32 }, [IFLA_GENEVE_PORT] = { .type = NLA_U16 }, [IFLA_GENEVE_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_GENEVE_UDP_CSUM] = { .type = NLA_U8 }, @@ -1295,8 +1303,8 @@ static struct geneve_dev *geneve_find_dev(struct geneve_net *gn, static int geneve_configure(struct net *net, struct net_device *dev, union geneve_addr *remote, - __u32 vni, __u8 ttl, __u8 tos, __be16 dst_port, - bool metadata, u32 flags) + __u32 vni, __u8 ttl, __u8 tos, __be32 label, + __be16 dst_port, bool metadata, u32 flags) { struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_dev *t, *geneve = netdev_priv(dev); @@ -1306,7 +1314,7 @@ static int geneve_configure(struct net *net, struct net_device *dev, if (!remote) return -EINVAL; if (metadata && - (remote->sa.sa_family != AF_UNSPEC || vni || tos || ttl)) + (remote->sa.sa_family != AF_UNSPEC || vni || tos || ttl || label)) return -EINVAL; geneve->net = net; @@ -1321,10 +1329,14 @@ static int geneve_configure(struct net *net, struct net_device *dev, (remote->sa.sa_family == AF_INET6 && ipv6_addr_is_multicast(&remote->sin6.sin6_addr))) return -EINVAL; + if (label && remote->sa.sa_family != AF_INET6) + return -EINVAL; + geneve->remote = *remote; geneve->ttl = ttl; geneve->tos = tos; + geneve->label = label; geneve->dst_port = dst_port; geneve->collect_md = metadata; geneve->flags = flags; @@ -1367,6 +1379,7 @@ static int geneve_newlink(struct net *net, struct net_device *dev, __u8 ttl = 0, tos = 0; bool metadata = false; union geneve_addr remote = geneve_remote_unspec; + __be32 label = 0; __u32 vni = 0; u32 flags = 0; @@ -1403,6 +1416,10 @@ static int geneve_newlink(struct net *net, struct net_device *dev, if (data[IFLA_GENEVE_TOS]) tos = nla_get_u8(data[IFLA_GENEVE_TOS]); + if (data[IFLA_GENEVE_LABEL]) + label = nla_get_be32(data[IFLA_GENEVE_LABEL]) & + IPV6_FLOWLABEL_MASK; + if (data[IFLA_GENEVE_PORT]) dst_port = nla_get_be16(data[IFLA_GENEVE_PORT]); @@ -1421,8 +1438,8 @@ static int geneve_newlink(struct net *net, struct net_device *dev, nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX])) flags |= GENEVE_F_UDP_ZERO_CSUM6_RX; - return geneve_configure(net, dev, &remote, vni, ttl, tos, dst_port, - metadata, flags); + return geneve_configure(net, dev, &remote, vni, ttl, tos, label, + dst_port, metadata, flags); } static void geneve_dellink(struct net_device *dev, struct list_head *head) @@ -1439,6 +1456,7 @@ static size_t geneve_get_size(const struct net_device *dev) nla_total_size(sizeof(struct in6_addr)) + /* IFLA_GENEVE_REMOTE{6} */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TOS */ + nla_total_size(sizeof(__be32)) + /* IFLA_GENEVE_LABEL */ nla_total_size(sizeof(__be16)) + /* IFLA_GENEVE_PORT */ nla_total_size(0) + /* IFLA_GENEVE_COLLECT_METADATA */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_CSUM */ @@ -1469,7 +1487,8 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) } if (nla_put_u8(skb, IFLA_GENEVE_TTL, geneve->ttl) || - nla_put_u8(skb, IFLA_GENEVE_TOS, geneve->tos)) + nla_put_u8(skb, IFLA_GENEVE_TOS, geneve->tos) || + nla_put_be32(skb, IFLA_GENEVE_LABEL, geneve->label)) goto nla_put_failure; if (nla_put_be16(skb, IFLA_GENEVE_PORT, geneve->dst_port)) @@ -1521,7 +1540,7 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name, return dev; err = geneve_configure(net, dev, &geneve_remote_unspec, - 0, 0, 0, htons(dst_port), true, + 0, 0, 0, 0, htons(dst_port), true, GENEVE_F_UDP_ZERO_CSUM6_RX); if (err) goto err; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 6bebc975031d..249eef9a21bd 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -479,6 +479,7 @@ enum { IFLA_GENEVE_UDP_CSUM, IFLA_GENEVE_UDP_ZERO_CSUM6_TX, IFLA_GENEVE_UDP_ZERO_CSUM6_RX, + IFLA_GENEVE_LABEL, __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) -- cgit v1.2.3 From c194cf93c164ed1c71142485ee0f70f9f2d1fe35 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 9 Mar 2016 09:24:23 -0800 Subject: gro: Defer clearing of flush bit in tunnel paths This patch updates the GRO handlers for GRE, VXLAN, GENEVE, and FOU so that we do not clear the flush bit until after we have called the next level GRO handler. Previously this was being cleared before parsing through the list of frames, however this resulted in several paths where either the bit needed to be reset but wasn't as in the case of FOU, or cases where it was being set as in GENEVE. By just deferring the clearing of the bit until after the next level protocol has been parsed we can avoid any unnecessary bit twiddling and avoid bugs. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- drivers/net/geneve.c | 7 ++----- drivers/net/vxlan.c | 3 +-- net/ipv4/fou.c | 3 +-- net/ipv4/gre_offload.c | 3 +-- 4 files changed, 5 insertions(+), 11 deletions(-) (limited to 'drivers/net/geneve.c') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 33185b9a435e..192631a345df 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -463,8 +463,6 @@ static struct sk_buff **geneve_gro_receive(struct sk_buff **head, goto out; } - flush = 0; - for (p = *head; p; p = p->next) { if (!NAPI_GRO_CB(p)->same_flow) continue; @@ -481,14 +479,13 @@ static struct sk_buff **geneve_gro_receive(struct sk_buff **head, rcu_read_lock(); ptype = gro_find_receive_by_type(type); - if (!ptype) { - flush = 1; + if (!ptype) goto out_unlock; - } skb_gro_pull(skb, gh_len); skb_gro_postpull_rcsum(skb, gh, gh_len); pp = ptype->callbacks.gro_receive(head, skb); + flush = 0; out_unlock: rcu_read_unlock(); diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 8eda76f9e474..800106a7246c 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -591,8 +591,6 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ - flush = 0; - for (p = *head; p; p = p->next) { if (!NAPI_GRO_CB(p)->same_flow) continue; @@ -606,6 +604,7 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, } pp = eth_gro_receive(head, skb); + flush = 0; out: skb_gro_remcsum_cleanup(skb, &grc); diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 88dab0c1670c..780484243e14 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -319,8 +319,6 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, skb_gro_pull(skb, hdrlen); - flush = 0; - for (p = *head; p; p = p->next) { const struct guehdr *guehdr2; @@ -352,6 +350,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, goto out_unlock; pp = ops->callbacks.gro_receive(head, skb); + flush = 0; out_unlock: rcu_read_unlock(); diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 47f4c544c916..540866dbd27d 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -175,8 +175,6 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, null_compute_pseudo); } - flush = 0; - for (p = *head; p; p = p->next) { const struct gre_base_hdr *greh2; @@ -213,6 +211,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, skb_gro_postpull_rcsum(skb, greh, grehlen); pp = ptype->callbacks.gro_receive(head, skb); + flush = 0; out_unlock: rcu_read_unlock(); -- cgit v1.2.3