From 4cb47a8644cc9eb8ec81190a50e79e6530d0297f Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 4 Aug 2020 07:53:43 +0200
Subject: tunnels: PMTU discovery support for directly bridged IP packets

It's currently possible to bridge Ethernet tunnels carrying IP
packets directly to external interfaces without assigning them
addresses and routes on the bridged network itself: this is the case
for UDP tunnels bridged with a standard bridge or by Open vSwitch.

PMTU discovery is currently broken with those configurations, because
the encapsulation effectively decreases the MTU of the link, and
while we are able to account for this using PMTU discovery on the
lower layer, we don't have a way to relay ICMP or ICMPv6 messages
needed by the sender, because we don't have valid routes to it.

On the other hand, as a tunnel endpoint, we can't fragment packets
as a general approach: this is for instance clearly forbidden for
VXLAN by RFC 7348, section 4.3:

   VTEPs MUST NOT fragment VXLAN packets.  Intermediate routers may
   fragment encapsulated VXLAN packets due to the larger frame size.
   The destination VTEP MAY silently discard such VXLAN fragments.

The same paragraph recommends that the MTU over the physical network
accomodates for encapsulations, but this isn't a practical option for
complex topologies, especially for typical Open vSwitch use cases.

Further, it states that:

   Other techniques like Path MTU discovery (see [RFC1191] and
   [RFC1981]) MAY be used to address this requirement as well.

Now, PMTU discovery already works for routed interfaces, we get
route exceptions created by the encapsulation device as they receive
ICMP Fragmentation Needed and ICMPv6 Packet Too Big messages, and
we already rebuild those messages with the appropriate MTU and route
them back to the sender.

Add the missing bits for bridged cases:

- checks in skb_tunnel_check_pmtu() to understand if it's appropriate
  to trigger a reply according to RFC 1122 section 3.2.2 for ICMP and
  RFC 4443 section 2.4 for ICMPv6. This function is already called by
  UDP tunnels

- a new function generating those ICMP or ICMPv6 replies. We can't
  reuse icmp_send() and icmp6_send() as we don't see the sender as a
  valid destination. This doesn't need to be generic, as we don't
  cover any other type of ICMP errors given that we only provide an
  encapsulation function to the sender

While at it, make the MTU check in skb_tunnel_check_pmtu() accurate:
we might receive GSO buffers here, and the passed headroom already
includes the inner MAC length, so we don't have to account for it
a second time (that would imply three MAC headers on the wire, but
there are just two).

This issue became visible while bridging IPv6 packets with 4500 bytes
of payload over GENEVE using IPv4 with a PMTU of 4000. Given the 50
bytes of encapsulation headroom, we would advertise MTU as 3950, and
we would reject fragmented IPv6 datagrams of 3958 bytes size on the
wire. We're exclusively dealing with network MTU here, though, so we
could get Ethernet frames up to 3964 octets in that case.

v2:
- moved skb_tunnel_check_pmtu() to ip_tunnel_core.c (David Ahern)
- split IPv4/IPv6 functions (David Ahern)

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bareudp.c | 5 +++--
 drivers/net/geneve.c  | 5 +++--
 drivers/net/vxlan.c   | 4 ++--
 3 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'drivers/net')

diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c
index 3b6664c7e73c..841910f1db65 100644
--- a/drivers/net/bareudp.c
+++ b/drivers/net/bareudp.c
@@ -308,7 +308,7 @@ static int bareudp_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 		return PTR_ERR(rt);
 
 	skb_tunnel_check_pmtu(skb, &rt->dst,
-			      BAREUDP_IPV4_HLEN + info->options_len);
+			      BAREUDP_IPV4_HLEN + info->options_len, false);
 
 	sport = udp_flow_src_port(bareudp->net, skb,
 				  bareudp->sport_min, USHRT_MAX,
@@ -369,7 +369,8 @@ static int bareudp6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	if (IS_ERR(dst))
 		return PTR_ERR(dst);
 
-	skb_tunnel_check_pmtu(skb, dst, BAREUDP_IPV6_HLEN + info->options_len);
+	skb_tunnel_check_pmtu(skb, dst, BAREUDP_IPV6_HLEN + info->options_len,
+			      false);
 
 	sport = udp_flow_src_port(bareudp->net, skb,
 				  bareudp->sport_min, USHRT_MAX,
diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 017c13acc911..de86b6d82132 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -894,7 +894,7 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 		return PTR_ERR(rt);
 
 	skb_tunnel_check_pmtu(skb, &rt->dst,
-			      GENEVE_IPV4_HLEN + info->options_len);
+			      GENEVE_IPV4_HLEN + info->options_len, false);
 
 	sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
 	if (geneve->cfg.collect_md) {
@@ -955,7 +955,8 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	if (IS_ERR(dst))
 		return PTR_ERR(dst);
 
-	skb_tunnel_check_pmtu(skb, dst, GENEVE_IPV6_HLEN + info->options_len);
+	skb_tunnel_check_pmtu(skb, dst, GENEVE_IPV6_HLEN + info->options_len,
+			      false);
 
 	sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
 	if (geneve->cfg.collect_md) {
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 77658425db8a..1432544da507 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2720,7 +2720,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		}
 
 		ndst = &rt->dst;
-		skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM);
+		skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM, false);
 
 		tos = ip_tunnel_ecn_encap(RT_TOS(tos), old_iph, skb);
 		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
@@ -2760,7 +2760,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 				goto out_unlock;
 		}
 
-		skb_tunnel_check_pmtu(skb, ndst, VXLAN6_HEADROOM);
+		skb_tunnel_check_pmtu(skb, ndst, VXLAN6_HEADROOM, false);
 
 		tos = ip_tunnel_ecn_encap(RT_TOS(tos), old_iph, skb);
 		ttl = ttl ? : ip6_dst_hoplimit(ndst);
-- 
cgit v1.2.3


From fc68c99577cc66e38d11b3e29304efb83fa08d53 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 4 Aug 2020 07:53:44 +0200
Subject: vxlan: Support for PMTU discovery on directly bridged links

If the interface is a bridge or Open vSwitch port, and we can't
forward a packet because it exceeds the local PMTU estimate,
trigger an ICMP or ICMPv6 reply to the sender, using the same
interface to forward it back.

If metadata collection is enabled, reverse destination and source
addresses, so that Open vSwitch is able to match this packet against
the existing, reverse flow.

v2: Use netif_is_any_bridge_port() (David Ahern)

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c | 47 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 41 insertions(+), 6 deletions(-)

(limited to 'drivers/net')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 1432544da507..6d5816be6131 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2500,7 +2500,8 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
 
 /* Bypass encapsulation if the destination is local */
 static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
-			       struct vxlan_dev *dst_vxlan, __be32 vni)
+			       struct vxlan_dev *dst_vxlan, __be32 vni,
+			       bool snoop)
 {
 	struct pcpu_sw_netstats *tx_stats, *rx_stats;
 	union vxlan_addr loopback;
@@ -2532,7 +2533,7 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
 		goto drop;
 	}
 
-	if (dst_vxlan->cfg.flags & VXLAN_F_LEARN)
+	if ((dst_vxlan->cfg.flags & VXLAN_F_LEARN) && snoop)
 		vxlan_snoop(dev, &loopback, eth_hdr(skb)->h_source, 0, vni);
 
 	u64_stats_update_begin(&tx_stats->syncp);
@@ -2581,7 +2582,7 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
 
 			return -ENOENT;
 		}
-		vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni);
+		vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni, true);
 		return 1;
 	}
 
@@ -2617,7 +2618,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		if (vxlan_addr_any(dst)) {
 			if (did_rsc) {
 				/* short-circuited back to local bridge */
-				vxlan_encap_bypass(skb, vxlan, vxlan, default_vni);
+				vxlan_encap_bypass(skb, vxlan, vxlan,
+						   default_vni, true);
 				return;
 			}
 			goto drop;
@@ -2720,7 +2722,23 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		}
 
 		ndst = &rt->dst;
-		skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM, false);
+		err = skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM,
+					    netif_is_any_bridge_port(dev));
+		if (err < 0) {
+			goto tx_error;
+		} else if (err) {
+			if (info) {
+				struct in_addr src, dst;
+
+				src = remote_ip.sin.sin_addr;
+				dst = local_ip.sin.sin_addr;
+				info->key.u.ipv4.src = src.s_addr;
+				info->key.u.ipv4.dst = dst.s_addr;
+			}
+			vxlan_encap_bypass(skb, vxlan, vxlan, vni, false);
+			dst_release(ndst);
+			goto out_unlock;
+		}
 
 		tos = ip_tunnel_ecn_encap(RT_TOS(tos), old_iph, skb);
 		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
@@ -2760,7 +2778,24 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 				goto out_unlock;
 		}
 
-		skb_tunnel_check_pmtu(skb, ndst, VXLAN6_HEADROOM, false);
+		err = skb_tunnel_check_pmtu(skb, ndst, VXLAN6_HEADROOM,
+					    netif_is_any_bridge_port(dev));
+		if (err < 0) {
+			goto tx_error;
+		} else if (err) {
+			if (info) {
+				struct in6_addr src, dst;
+
+				src = remote_ip.sin6.sin6_addr;
+				dst = local_ip.sin6.sin6_addr;
+				info->key.u.ipv6.src = src;
+				info->key.u.ipv6.dst = dst;
+			}
+
+			vxlan_encap_bypass(skb, vxlan, vxlan, vni, false);
+			dst_release(ndst);
+			goto out_unlock;
+		}
 
 		tos = ip_tunnel_ecn_encap(RT_TOS(tos), old_iph, skb);
 		ttl = ttl ? : ip6_dst_hoplimit(ndst);
-- 
cgit v1.2.3


From c1a800e88dbffca4ef48000cb3f9ad618dc7ad89 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 4 Aug 2020 07:53:45 +0200
Subject: geneve: Support for PMTU discovery on directly bridged links

If the interface is a bridge or Open vSwitch port, and we can't
forward a packet because it exceeds the local PMTU estimate,
trigger an ICMP or ICMPv6 reply to the sender, using the same
interface to forward it back.

If metadata collection is enabled, set destination and source
addresses for the flow as if we were receiving the packet, so that
Open vSwitch can match the ICMP error against the existing
association.

v2: Use netif_is_any_bridge_port() (David Ahern)

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 51 insertions(+), 5 deletions(-)

(limited to 'drivers/net')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index de86b6d82132..c71f994fbc73 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -893,8 +893,31 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	if (IS_ERR(rt))
 		return PTR_ERR(rt);
 
-	skb_tunnel_check_pmtu(skb, &rt->dst,
-			      GENEVE_IPV4_HLEN + info->options_len, false);
+	err = skb_tunnel_check_pmtu(skb, &rt->dst,
+				    GENEVE_IPV4_HLEN + info->options_len,
+				    netif_is_any_bridge_port(dev));
+	if (err < 0) {
+		dst_release(&rt->dst);
+		return err;
+	} else if (err) {
+		struct ip_tunnel_info *info;
+
+		info = skb_tunnel_info(skb);
+		if (info) {
+			info->key.u.ipv4.dst = fl4.saddr;
+			info->key.u.ipv4.src = fl4.daddr;
+		}
+
+		if (!pskb_may_pull(skb, ETH_HLEN)) {
+			dst_release(&rt->dst);
+			return -EINVAL;
+		}
+
+		skb->protocol = eth_type_trans(skb, geneve->dev);
+		netif_rx(skb);
+		dst_release(&rt->dst);
+		return -EMSGSIZE;
+	}
 
 	sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
 	if (geneve->cfg.collect_md) {
@@ -955,8 +978,30 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	if (IS_ERR(dst))
 		return PTR_ERR(dst);
 
-	skb_tunnel_check_pmtu(skb, dst, GENEVE_IPV6_HLEN + info->options_len,
-			      false);
+	err = skb_tunnel_check_pmtu(skb, dst,
+				    GENEVE_IPV6_HLEN + info->options_len,
+				    netif_is_any_bridge_port(dev));
+	if (err < 0) {
+		dst_release(dst);
+		return err;
+	} else if (err) {
+		struct ip_tunnel_info *info = skb_tunnel_info(skb);
+
+		if (info) {
+			info->key.u.ipv6.dst = fl6.saddr;
+			info->key.u.ipv6.src = fl6.daddr;
+		}
+
+		if (!pskb_may_pull(skb, ETH_HLEN)) {
+			dst_release(dst);
+			return -EINVAL;
+		}
+
+		skb->protocol = eth_type_trans(skb, geneve->dev);
+		netif_rx(skb);
+		dst_release(dst);
+		return -EMSGSIZE;
+	}
 
 	sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
 	if (geneve->cfg.collect_md) {
@@ -1013,7 +1058,8 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (likely(!err))
 		return NETDEV_TX_OK;
 
-	dev_kfree_skb(skb);
+	if (err != -EMSGSIZE)
+		dev_kfree_skb(skb);
 
 	if (err == -ELOOP)
 		dev->stats.collisions++;
-- 
cgit v1.2.3