From 510c321b557121861601f9d259aadd65aa274f35 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 14 Feb 2018 19:06:02 +0800
Subject: xfrm: reuse uncached_list to track xdsts

In early time, when freeing a xdst, it would be inserted into
dst_garbage.list first. Then if it's refcnt was still held
somewhere, later it would be put into dst_busy_list in
dst_gc_task().

When one dev was being unregistered, the dev of these dsts in
dst_busy_list would be set with loopback_dev and put this dev.
So that this dev's removal wouldn't get blocked, and avoid the
kmsg warning:

  kernel:unregister_netdevice: waiting for veth0 to become \
  free. Usage count = 2

However after Commit 52df157f17e5 ("xfrm: take refcnt of dst
when creating struct xfrm_dst bundle"), the xdst will not be
freed with dst gc, and this warning happens.

To fix it, we need to find these xdsts that are still held by
others when removing the dev, and free xdst's dev and set it
with loopback_dev.

But unfortunately after flow_cache for xfrm was deleted, no
list tracks them anymore. So we need to save these xdsts
somewhere to release the xdst's dev later.

To make this easier, this patch is to reuse uncached_list to
track xdsts, so that the dev refcnt can be released in the
event NETDEV_UNREGISTER process of fib_netdev_notifier.

Thanks to Florian, we could move forward this fix quickly.

Fixes: 52df157f17e5 ("xfrm: take refcnt of dst when creating struct xfrm_dst bundle")
Reported-by: Jianlin Shi <jishi@redhat.com>
Reported-by: Hangbin Liu <liuhangbin@gmail.com>
Tested-by: Eyal Birger <eyal.birger@gmail.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/route.c        | 21 +++++++++++++--------
 net/ipv4/xfrm4_policy.c |  4 +++-
 2 files changed, 16 insertions(+), 9 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 49cc1c1df1ba..1d1e4abe04b0 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1383,7 +1383,7 @@ struct uncached_list {
 
 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
 
-static void rt_add_uncached_list(struct rtable *rt)
+void rt_add_uncached_list(struct rtable *rt)
 {
 	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
 
@@ -1394,14 +1394,8 @@ static void rt_add_uncached_list(struct rtable *rt)
 	spin_unlock_bh(&ul->lock);
 }
 
-static void ipv4_dst_destroy(struct dst_entry *dst)
+void rt_del_uncached_list(struct rtable *rt)
 {
-	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
-	struct rtable *rt = (struct rtable *) dst;
-
-	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
-		kfree(p);
-
 	if (!list_empty(&rt->rt_uncached)) {
 		struct uncached_list *ul = rt->rt_uncached_list;
 
@@ -1411,6 +1405,17 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
 	}
 }
 
+static void ipv4_dst_destroy(struct dst_entry *dst)
+{
+	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
+	struct rtable *rt = (struct rtable *)dst;
+
+	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
+		kfree(p);
+
+	rt_del_uncached_list(rt);
+}
+
 void rt_flush_dev(struct net_device *dev)
 {
 	struct net *net = dev_net(dev);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 05017e2c849c..8d33f7b311f4 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -102,6 +102,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	xdst->u.rt.rt_pmtu = rt->rt_pmtu;
 	xdst->u.rt.rt_table_id = rt->rt_table_id;
 	INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
+	rt_add_uncached_list(&xdst->u.rt);
 
 	return 0;
 }
@@ -241,7 +242,8 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
 	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
 
 	dst_destroy_metrics_generic(dst);
-
+	if (xdst->u.rt.rt_uncached_list)
+		rt_del_uncached_list(&xdst->u.rt);
 	xfrm_dst_destroy(xdst);
 }
 
-- 
cgit v1.2.3


From 2cbb4ea7de167b02ffa63e9cdfdb07a7e7094615 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 16 Feb 2018 11:03:03 -0800
Subject: net: Only honor ifindex in IP_PKTINFO if non-0

Only allow ifindex from IP_PKTINFO to override SO_BINDTODEVICE settings
if the index is actually set in the message.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_sockglue.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 9c41a0cef1a5..74c962b9b09c 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -258,7 +258,8 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
 			src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg);
 			if (!ipv6_addr_v4mapped(&src_info->ipi6_addr))
 				return -EINVAL;
-			ipc->oif = src_info->ipi6_ifindex;
+			if (src_info->ipi6_ifindex)
+				ipc->oif = src_info->ipi6_ifindex;
 			ipc->addr = src_info->ipi6_addr.s6_addr32[3];
 			continue;
 		}
@@ -288,7 +289,8 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
 			if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo)))
 				return -EINVAL;
 			info = (struct in_pktinfo *)CMSG_DATA(cmsg);
-			ipc->oif = info->ipi_ifindex;
+			if (info->ipi_ifindex)
+				ipc->oif = info->ipi_ifindex;
 			ipc->addr = info->ipi_spec_dst.s_addr;
 			break;
 		}
-- 
cgit v1.2.3


From 87cdf3148b11d46382dbce2754ae7036aba96380 Mon Sep 17 00:00:00 2001
From: Yossi Kuperman <yossiku@mellanox.com>
Date: Sun, 4 Mar 2018 21:03:52 +0200
Subject: xfrm: Verify MAC header exists before overwriting
 eth_hdr(skb)->h_proto

Artem Savkov reported that commit 5efec5c655dd leads to a packet loss under
IPSec configuration. It appears that his setup consists of a TUN device,
which does not have a MAC header.

Make sure MAC header exists.

Note: TUN device sets a MAC header pointer, although it does not have one.

Fixes: 5efec5c655dd ("xfrm: Fix eth_hdr(skb)->h_proto to reflect inner IP version")
Reported-by: Artem Savkov <artem.savkov@gmail.com>
Tested-by: Artem Savkov <artem.savkov@gmail.com>
Signed-off-by: Yossi Kuperman <yossiku@mellanox.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/xfrm4_mode_tunnel.c | 3 ++-
 net/ipv6/xfrm6_mode_tunnel.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 63faeee989a9..2a9764bd1719 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -92,7 +92,8 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
 
 	skb_reset_network_header(skb);
 	skb_mac_header_rebuild(skb);
-	eth_hdr(skb)->h_proto = skb->protocol;
+	if (skb->mac_len)
+		eth_hdr(skb)->h_proto = skb->protocol;
 
 	err = 0;
 
diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c
index bb935a3b7fea..de1b0b8c53b0 100644
--- a/net/ipv6/xfrm6_mode_tunnel.c
+++ b/net/ipv6/xfrm6_mode_tunnel.c
@@ -92,7 +92,8 @@ static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
 
 	skb_reset_network_header(skb);
 	skb_mac_header_rebuild(skb);
-	eth_hdr(skb)->h_proto = skb->protocol;
+	if (skb->mac_len)
+		eth_hdr(skb)->h_proto = skb->protocol;
 
 	err = 0;
 
-- 
cgit v1.2.3


From a560002437d3646dafccecb1bf32d1685112ddda Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 6 Mar 2018 18:46:39 +0300
Subject: net: Fix hlist corruptions in inet_evict_bucket()

inet_evict_bucket() iterates global list, and
several tasks may call it in parallel. All of
them hash the same fq->list_evictor to different
lists, which leads to list corruption.

This patch makes fq be hashed to expired list
only if this has not been made yet by another
task. Since inet_frag_alloc() allocates fq
using kmem_cache_zalloc(), we may rely on
list_evictor is initially unhashed.

The problem seems to exist before async
pernet_operations, as there was possible to have
exit method to be executed in parallel with
inet_frags::frags_work, so I add two Fixes tags.
This also may go to stable.

Fixes: d1fe19444d82 "inet: frag: don't re-use chainlist for evictor"
Fixes: f84c6821aa54 "net: Convert pernet_subsys, registered from inet_init()"
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_fragment.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 26a3d0315728..e8ec28999f5c 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -119,6 +119,9 @@ out:
 
 static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
 {
+	if (!hlist_unhashed(&q->list_evictor))
+		return false;
+
 	return q->net->low_thresh == 0 ||
 	       frag_mem_limit(q->net) >= q->net->low_thresh;
 }
-- 
cgit v1.2.3


From e05836ac07c77dd90377f8c8140bce2a44af5fe7 Mon Sep 17 00:00:00 2001
From: Soheil Hassas Yeganeh <soheil@google.com>
Date: Tue, 6 Mar 2018 17:15:12 -0500
Subject: tcp: purge write queue upon aborting the connection

When the connection is aborted, there is no point in
keeping the packets on the write queue until the connection
is closed.

Similar to a27fd7a8ed38 ('tcp: purge write queue upon RST'),
this is essential for a correct MSG_ZEROCOPY implementation,
because userspace cannot call close(fd) before receiving
zerocopy signals even when the connection is aborted.

Fixes: f214f915e7db ("tcp: enable MSG_ZEROCOPY")
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c       | 1 +
 net/ipv4/tcp_timer.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 48636aee23c3..8b8059b7af4d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3566,6 +3566,7 @@ int tcp_abort(struct sock *sk, int err)
 
 	bh_unlock_sock(sk);
 	local_bh_enable();
+	tcp_write_queue_purge(sk);
 	release_sock(sk);
 	return 0;
 }
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 71fc60f1b326..f7d944855f8e 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -34,6 +34,7 @@ static void tcp_write_err(struct sock *sk)
 	sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
 	sk->sk_error_report(sk);
 
+	tcp_write_queue_purge(sk);
 	tcp_done(sk);
 	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT);
 }
-- 
cgit v1.2.3


From bf2ae2e4bf9360e07c0cdfa166bcdc0afd92f4ce Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 10 Mar 2018 18:57:50 +0800
Subject: sock_diag: request _diag module only when the family or proto has
 been registered

Now when using 'ss' in iproute, kernel would try to load all _diag
modules, which also causes corresponding family and proto modules
to be loaded as well due to module dependencies.

Like after running 'ss', sctp, dccp, af_packet (if it works as a module)
would be loaded.

For example:

  $ lsmod|grep sctp
  $ ss
  $ lsmod|grep sctp
  sctp_diag              16384  0
  sctp                  323584  5 sctp_diag
  inet_diag              24576  4 raw_diag,tcp_diag,sctp_diag,udp_diag
  libcrc32c              16384  3 nf_conntrack,nf_nat,sctp

As these family and proto modules are loaded unintentionally, it
could cause some problems, like:

- Some debug tools use 'ss' to collect the socket info, which loads all
  those diag and family and protocol modules. It's noisy for identifying
  issues.

- Users usually expect to drop sctp init packet silently when they
  have no sense of sctp protocol instead of sending abort back.

- It wastes resources (especially with multiple netns), and SCTP module
  can't be unloaded once it's loaded.

...

In short, it's really inappropriate to have these family and proto
modules loaded unexpectedly when just doing debugging with inet_diag.

This patch is to introduce sock_load_diag_module() where it loads
the _diag module only when it's corresponding family or proto has
been already registered.

Note that we can't just load _diag module without the family or
proto loaded, as some symbols used in _diag module are from the
family or proto module.

v1->v2:
  - move inet proto check to inet_diag to avoid a compiling err.
v2->v3:
  - define sock_load_diag_module in sock.c and export one symbol
    only.
  - improve the changelog.

Reported-by: Sabrina Dubroca <sd@queasysnail.net>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Phil Sutter <phil@nwl.cc>
Acked-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net.h  |  1 +
 include/net/sock.h   |  1 +
 net/core/sock.c      | 21 +++++++++++++++++++++
 net/core/sock_diag.c | 12 ++++--------
 net/ipv4/inet_diag.c |  3 +--
 net/socket.c         |  5 +++++
 6 files changed, 33 insertions(+), 10 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/net.h b/include/linux/net.h
index 91216b16feb7..2a0391eea05c 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -222,6 +222,7 @@ enum {
 int sock_wake_async(struct socket_wq *sk_wq, int how, int band);
 int sock_register(const struct net_proto_family *fam);
 void sock_unregister(int family);
+bool sock_is_registered(int family);
 int __sock_create(struct net *net, int family, int type, int proto,
 		  struct socket **res, int kern);
 int sock_create(int family, int type, int proto, struct socket **res);
diff --git a/include/net/sock.h b/include/net/sock.h
index 169c92afcafa..ae23f3b389ca 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1137,6 +1137,7 @@ struct proto {
 
 int proto_register(struct proto *prot, int alloc_slab);
 void proto_unregister(struct proto *prot);
+int sock_load_diag_module(int family, int protocol);
 
 #ifdef SOCK_REFCNT_DEBUG
 static inline void sk_refcnt_debug_inc(struct sock *sk)
diff --git a/net/core/sock.c b/net/core/sock.c
index c501499a04fe..85b0b64e7f9d 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3261,6 +3261,27 @@ void proto_unregister(struct proto *prot)
 }
 EXPORT_SYMBOL(proto_unregister);
 
+int sock_load_diag_module(int family, int protocol)
+{
+	if (!protocol) {
+		if (!sock_is_registered(family))
+			return -ENOENT;
+
+		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
+				      NETLINK_SOCK_DIAG, family);
+	}
+
+#ifdef CONFIG_INET
+	if (family == AF_INET &&
+	    !rcu_access_pointer(inet_protos[protocol]))
+		return -ENOENT;
+#endif
+
+	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
+			      NETLINK_SOCK_DIAG, family, protocol);
+}
+EXPORT_SYMBOL(sock_load_diag_module);
+
 #ifdef CONFIG_PROC_FS
 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
 	__acquires(proto_list_mutex)
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 146b50e30659..c37b5be7c5e4 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -220,8 +220,7 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
 		return -EINVAL;
 
 	if (sock_diag_handlers[req->sdiag_family] == NULL)
-		request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
-				NETLINK_SOCK_DIAG, req->sdiag_family);
+		sock_load_diag_module(req->sdiag_family, 0);
 
 	mutex_lock(&sock_diag_table_mutex);
 	hndl = sock_diag_handlers[req->sdiag_family];
@@ -247,8 +246,7 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 	case TCPDIAG_GETSOCK:
 	case DCCPDIAG_GETSOCK:
 		if (inet_rcv_compat == NULL)
-			request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
-					NETLINK_SOCK_DIAG, AF_INET);
+			sock_load_diag_module(AF_INET, 0);
 
 		mutex_lock(&sock_diag_table_mutex);
 		if (inet_rcv_compat != NULL)
@@ -281,14 +279,12 @@ static int sock_diag_bind(struct net *net, int group)
 	case SKNLGRP_INET_TCP_DESTROY:
 	case SKNLGRP_INET_UDP_DESTROY:
 		if (!sock_diag_handlers[AF_INET])
-			request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
-				       NETLINK_SOCK_DIAG, AF_INET);
+			sock_load_diag_module(AF_INET, 0);
 		break;
 	case SKNLGRP_INET6_TCP_DESTROY:
 	case SKNLGRP_INET6_UDP_DESTROY:
 		if (!sock_diag_handlers[AF_INET6])
-			request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
-				       NETLINK_SOCK_DIAG, AF_INET6);
+			sock_load_diag_module(AF_INET6, 0);
 		break;
 	}
 	return 0;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index a383f299ce24..4e5bc4b2f14e 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -53,8 +53,7 @@ static DEFINE_MUTEX(inet_diag_table_mutex);
 static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
 {
 	if (!inet_diag_table[proto])
-		request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
-			       NETLINK_SOCK_DIAG, AF_INET, proto);
+		sock_load_diag_module(AF_INET, proto);
 
 	mutex_lock(&inet_diag_table_mutex);
 	if (!inet_diag_table[proto])
diff --git a/net/socket.c b/net/socket.c
index a93c99b518ca..08847c3b8c39 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2587,6 +2587,11 @@ void sock_unregister(int family)
 }
 EXPORT_SYMBOL(sock_unregister);
 
+bool sock_is_registered(int family)
+{
+	return family < NPROTO && rcu_access_pointer(net_families[family]);
+}
+
 static int __init sock_init(void)
 {
 	int err;
-- 
cgit v1.2.3


From d52e5a7e7ca49457dd31fc8b42fb7c0d58a31221 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Wed, 14 Mar 2018 10:21:14 +0100
Subject: ipv4: lock mtu in fnhe when received PMTU < net.ipv4.route.min_pmtu

Prior to the rework of PMTU information storage in commit
2c8cec5c10bc ("ipv4: Cache learned PMTU information in inetpeer."),
when a PMTU event advertising a PMTU smaller than
net.ipv4.route.min_pmtu was received, we would disable setting the DF
flag on packets by locking the MTU metric, and set the PMTU to
net.ipv4.route.min_pmtu.

Since then, we don't disable DF, and set PMTU to
net.ipv4.route.min_pmtu, so the intermediate router that has this link
with a small MTU will have to drop the packets.

This patch reestablishes pre-2.6.39 behavior by splitting
rtable->rt_pmtu into a bitfield with rt_mtu_locked and rt_pmtu.
rt_mtu_locked indicates that we shouldn't set the DF bit on that path,
and is checked in ip_dont_fragment().

One possible workaround is to set net.ipv4.route.min_pmtu to a value low
enough to accommodate the lowest MTU encountered.

Fixes: 2c8cec5c10bc ("ipv4: Cache learned PMTU information in inetpeer.")
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h        | 11 +++++++++--
 include/net/ip_fib.h    |  1 +
 include/net/route.h     |  3 ++-
 net/ipv4/route.c        | 26 +++++++++++++++++++-------
 net/ipv4/xfrm4_policy.c |  1 +
 5 files changed, 32 insertions(+), 10 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/ip.h b/include/net/ip.h
index 746abff9ce51..f49b3a576bec 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -328,6 +328,13 @@ int ip_decrease_ttl(struct iphdr *iph)
 	return --iph->ttl;
 }
 
+static inline int ip_mtu_locked(const struct dst_entry *dst)
+{
+	const struct rtable *rt = (const struct rtable *)dst;
+
+	return rt->rt_mtu_locked || dst_metric_locked(dst, RTAX_MTU);
+}
+
 static inline
 int ip_dont_fragment(const struct sock *sk, const struct dst_entry *dst)
 {
@@ -335,7 +342,7 @@ int ip_dont_fragment(const struct sock *sk, const struct dst_entry *dst)
 
 	return  pmtudisc == IP_PMTUDISC_DO ||
 		(pmtudisc == IP_PMTUDISC_WANT &&
-		 !(dst_metric_locked(dst, RTAX_MTU)));
+		 !ip_mtu_locked(dst));
 }
 
 static inline bool ip_sk_accept_pmtu(const struct sock *sk)
@@ -361,7 +368,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
 	struct net *net = dev_net(dst->dev);
 
 	if (net->ipv4.sysctl_ip_fwd_use_pmtu ||
-	    dst_metric_locked(dst, RTAX_MTU) ||
+	    ip_mtu_locked(dst) ||
 	    !forwarding)
 		return dst_mtu(dst);
 
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index f80524396c06..77d0a78cf7d2 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -59,6 +59,7 @@ struct fib_nh_exception {
 	int				fnhe_genid;
 	__be32				fnhe_daddr;
 	u32				fnhe_pmtu;
+	bool				fnhe_mtu_locked;
 	__be32				fnhe_gw;
 	unsigned long			fnhe_expires;
 	struct rtable __rcu		*fnhe_rth_input;
diff --git a/include/net/route.h b/include/net/route.h
index 40b870d58f38..20a92ca9e115 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -63,7 +63,8 @@ struct rtable {
 	__be32			rt_gateway;
 
 	/* Miscellaneous cached information */
-	u32			rt_pmtu;
+	u32			rt_mtu_locked:1,
+				rt_pmtu:31;
 
 	u32			rt_table_id;
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f9dbb8cb66bf..299e247b2032 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -634,6 +634,7 @@ static inline u32 fnhe_hashfun(__be32 daddr)
 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 {
 	rt->rt_pmtu = fnhe->fnhe_pmtu;
+	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 	rt->dst.expires = fnhe->fnhe_expires;
 
 	if (fnhe->fnhe_gw) {
@@ -644,7 +645,7 @@ static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnh
 }
 
 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
-				  u32 pmtu, unsigned long expires)
+				  u32 pmtu, bool lock, unsigned long expires)
 {
 	struct fnhe_hash_bucket *hash;
 	struct fib_nh_exception *fnhe;
@@ -681,8 +682,10 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 			fnhe->fnhe_genid = genid;
 		if (gw)
 			fnhe->fnhe_gw = gw;
-		if (pmtu)
+		if (pmtu) {
 			fnhe->fnhe_pmtu = pmtu;
+			fnhe->fnhe_mtu_locked = lock;
+		}
 		fnhe->fnhe_expires = max(1UL, expires);
 		/* Update all cached dsts too */
 		rt = rcu_dereference(fnhe->fnhe_rth_input);
@@ -706,6 +709,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 		fnhe->fnhe_daddr = daddr;
 		fnhe->fnhe_gw = gw;
 		fnhe->fnhe_pmtu = pmtu;
+		fnhe->fnhe_mtu_locked = lock;
 		fnhe->fnhe_expires = expires;
 
 		/* Exception created; mark the cached routes for the nexthop
@@ -787,7 +791,8 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
 				struct fib_nh *nh = &FIB_RES_NH(res);
 
 				update_or_create_fnhe(nh, fl4->daddr, new_gw,
-						0, jiffies + ip_rt_gc_timeout);
+						0, false,
+						jiffies + ip_rt_gc_timeout);
 			}
 			if (kill_route)
 				rt->dst.obsolete = DST_OBSOLETE_KILL;
@@ -1009,15 +1014,18 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 {
 	struct dst_entry *dst = &rt->dst;
 	struct fib_result res;
+	bool lock = false;
 
-	if (dst_metric_locked(dst, RTAX_MTU))
+	if (ip_mtu_locked(dst))
 		return;
 
 	if (ipv4_mtu(dst) < mtu)
 		return;
 
-	if (mtu < ip_rt_min_pmtu)
+	if (mtu < ip_rt_min_pmtu) {
+		lock = true;
 		mtu = ip_rt_min_pmtu;
+	}
 
 	if (rt->rt_pmtu == mtu &&
 	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
@@ -1027,7 +1035,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
 		struct fib_nh *nh = &FIB_RES_NH(res);
 
-		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
+		update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
 				      jiffies + ip_rt_mtu_expires);
 	}
 	rcu_read_unlock();
@@ -1280,7 +1288,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 
 	mtu = READ_ONCE(dst->dev->mtu);
 
-	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
+	if (unlikely(ip_mtu_locked(dst))) {
 		if (rt->rt_uses_gateway && mtu > 576)
 			mtu = 576;
 	}
@@ -1521,6 +1529,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
 		rt->rt_is_input = 0;
 		rt->rt_iif = 0;
 		rt->rt_pmtu = 0;
+		rt->rt_mtu_locked = 0;
 		rt->rt_gateway = 0;
 		rt->rt_uses_gateway = 0;
 		rt->rt_table_id = 0;
@@ -2546,6 +2555,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		rt->rt_is_input = ort->rt_is_input;
 		rt->rt_iif = ort->rt_iif;
 		rt->rt_pmtu = ort->rt_pmtu;
+		rt->rt_mtu_locked = ort->rt_mtu_locked;
 
 		rt->rt_genid = rt_genid_ipv4(net);
 		rt->rt_flags = ort->rt_flags;
@@ -2648,6 +2658,8 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
 	if (rt->rt_pmtu && expires)
 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
+	if (rt->rt_mtu_locked && expires)
+		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
 	if (rtnetlink_put_metrics(skb, metrics) < 0)
 		goto nla_put_failure;
 
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 8d33f7b311f4..fbebda67ac1b 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -100,6 +100,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	xdst->u.rt.rt_gateway = rt->rt_gateway;
 	xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway;
 	xdst->u.rt.rt_pmtu = rt->rt_pmtu;
+	xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked;
 	xdst->u.rt.rt_table_id = rt->rt_table_id;
 	INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
 	rt_add_uncached_list(&xdst->u.rt);
-- 
cgit v1.2.3