From f8c468e8537925e0c4607263f498a1b7c0c8982e Mon Sep 17 00:00:00 2001
From: David Rientjes
Date: Wed, 2 Jan 2019 13:01:43 -0800
Subject: net, skbuff: do not prefer skb allocation fails early

Commit dcda9b04713c ("mm, tree wide: replace __GFP_REPEAT by
__GFP_RETRY_MAYFAIL with more useful semantic") replaced __GFP_REPEAT in
alloc_skb_with_frags() with __GFP_RETRY_MAYFAIL when the allocation may
directly reclaim.

The previous behavior would require reclaim of up to 1 << order pages for
an skb-aligned header_len of order > PAGE_ALLOC_COSTLY_ORDER before
failing; otherwise the allocations in alloc_skb() would loop in the page
allocator looking for memory.  __GFP_RETRY_MAYFAIL makes both allocations
failable under memory pressure, including for the HEAD allocation.

This can cause, among many other things, write() to fail with ENOTCONN
during RPC when under memory pressure.

These allocations should succeed as they did previous to dcda9b04713c,
even if it requires calling the oom killer and additional looping in the
page allocator to find memory.  There is no way to specify the previous
behavior of __GFP_REPEAT, but it's unlikely to be necessary since the
previous behavior only guaranteed that 1 << order pages would be
reclaimed before failing for order > PAGE_ALLOC_COSTLY_ORDER.  That
reclaimed memory is not guaranteed to be contiguous, so repeating for
such large orders is usually not beneficial.

Remove the setting of __GFP_RETRY_MAYFAIL to restore the previous
behavior: specifically, do not allow alloc_skb() to fail for small
orders, and oom kill if necessary rather than allowing RPCs to fail.

Fixes: dcda9b04713c ("mm, tree wide: replace __GFP_REPEAT by __GFP_RETRY_MAYFAIL with more useful semantic")
Signed-off-by: David Rientjes
Reviewed-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/skbuff.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'net/core')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 37317ffec146..26d848484912 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5270,7 +5270,6 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
 	unsigned long chunk;
 	struct sk_buff *skb;
 	struct page *page;
-	gfp_t gfp_head;
 	int i;
 
 	*errcode = -EMSGSIZE;
@@ -5280,12 +5279,8 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
 	if (npages > MAX_SKB_FRAGS)
 		return NULL;
 
-	gfp_head = gfp_mask;
-	if (gfp_head & __GFP_DIRECT_RECLAIM)
-		gfp_head |= __GFP_RETRY_MAYFAIL;
-
 	*errcode = -ENOBUFS;
-	skb = alloc_skb(header_len, gfp_head);
+	skb = alloc_skb(header_len, gfp_mask);
 	if (!skb)
 		return NULL;
--
cgit v1.2.3


From 31aa6503a15ba00182ea6dbbf51afb63bf9e851d Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Tue, 8 Jan 2019 18:12:24 -0800
Subject: bpf: correctly set initial window on active Fast Open sender

The existing BPF TCP initial congestion window (TCP_BPF_IW) does not
work on an (active) Fast Open sender.  This is because it changes the
(initial) window only if data_segs_out is zero -- but data_segs_out is
also incremented on SYN-data.  This patch fixes the issue by properly
accounting for SYN-data as well.
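
For illustration, a minimal sock_ops program that exercises TCP_BPF_IW
might look like the sketch below (hypothetical window value; a
selftests-style "bpf_helpers.h" is assumed; not part of this patch).
With this fix it also takes effect for a Fast Open sender, whose
data_segs_out already counts the SYN-data segment:

  /* Sketch only: set the initial cwnd once the active side is
   * established.  TCP_BPF_IW writes tp->snd_cwnd, and after this
   * patch the data_segs_out check tolerates the SYN-data segment.
   */
  #include <linux/bpf.h>
  #include "bpf_helpers.h"	/* selftests-style helper declarations */

  #ifndef SOL_TCP
  #define SOL_TCP 6
  #endif

  SEC("sockops")
  int set_initial_cwnd(struct bpf_sock_ops *skops)
  {
  	int iw = 40;	/* hypothetical initial window, in segments */

  	if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB)
  		bpf_setsockopt(skops, SOL_TCP, TCP_BPF_IW,
  			       &iw, sizeof(iw));
  	return 1;
  }

  char _license[] SEC("license") = "GPL";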
Fixes: fc7478103c84 ("bpf: Adds support for setting initial cwnd")
Signed-off-by: Yuchung Cheng
Reviewed-by: Neal Cardwell
Acked-by: Lawrence Brakmo
Signed-off-by: Alexei Starovoitov
---
 net/core/filter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index 447dd1bad31f..2b3b436ef545 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4203,7 +4203,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 		/* Only some options are supported */
 		switch (optname) {
 		case TCP_BPF_IW:
-			if (val <= 0 || tp->data_segs_out > 0)
+			if (val <= 0 || tp->data_segs_out > tp->syn_data)
 				ret = -EINVAL;
 			else
 				tp->snd_cwnd = val;
--
cgit v1.2.3


From 85704cb8dcfd88d351bfc87faaeba1c8214f3177 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov
Date: Tue, 8 Jan 2019 12:30:00 +0300
Subject: net/core/neighbour: tell kmemleak about hash tables

This fixes false-positive kmemleak reports about leaked neighbour
entries:

unreferenced object 0xffff8885c6e4d0a8 (size 1024):
  comm "softirq", pid 0, jiffies 4294922664 (age 167640.804s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 20 2c f3 83 ff ff ff ff  ........ ,......
    08 c0 ef 5f 84 88 ff ff 01 8c 7d 02 01 00 00 00  ..._......}.....
  backtrace:
    [<00000000748509fe>] ip6_finish_output2+0x887/0x1e40
    [<0000000036d7a0d8>] ip6_output+0x1ba/0x600
    [<0000000027ea7dba>] ip6_send_skb+0x92/0x2f0
    [<00000000d6e2111d>] udp_v6_send_skb.isra.24+0x680/0x15e0
    [<000000000668a8be>] udpv6_sendmsg+0x18c9/0x27a0
    [<000000004bd5fa90>] sock_sendmsg+0xb3/0xf0
    [<000000008227b29f>] ___sys_sendmsg+0x745/0x8f0
    [<000000008698009d>] __sys_sendmsg+0xde/0x170
    [<00000000889dacf1>] do_syscall_64+0x9b/0x400
    [<0000000081cdb353>] entry_SYSCALL_64_after_hwframe+0x49/0xbe
    [<000000005767ed39>] 0xffffffffffffffff

Signed-off-by: Konstantin Khlebnikov
Signed-off-by: David S. Miller
---
 net/core/neighbour.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 763a7b08df67..3e27a779f288 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -18,6 +18,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/slab.h>
+#include <linux/kmemleak.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
@@ -443,12 +444,14 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
 	ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
 	if (!ret)
 		return NULL;
-	if (size <= PAGE_SIZE)
+	if (size <= PAGE_SIZE) {
 		buckets = kzalloc(size, GFP_ATOMIC);
-	else
+	} else {
 		buckets = (struct neighbour __rcu **)
			  __get_free_pages(GFP_ATOMIC | __GFP_ZERO,
					   get_order(size));
+		kmemleak_alloc(buckets, size, 0, GFP_ATOMIC);
+	}
 	if (!buckets) {
 		kfree(ret);
 		return NULL;
 	}
@@ -468,10 +471,12 @@ static void neigh_hash_free_rcu(struct rcu_head *head)
 	size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *);
 	struct neighbour __rcu **buckets = nht->hash_buckets;
 
-	if (size <= PAGE_SIZE)
+	if (size <= PAGE_SIZE) {
 		kfree(buckets);
-	else
+	} else {
+		kmemleak_free(buckets);
 		free_pages((unsigned long)buckets, get_order(size));
+	}
 	kfree(nht);
 }
--
cgit v1.2.3


From 01b833ab44c9e484060aad72267fc7e71beb559b Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov
Date: Mon, 14 Jan 2019 13:38:43 +0300
Subject: net/core/neighbour: fix kmemleak minimal reference count for hash tables

The minimal reference count should be 1 for normal allocations;
0 disables leak reporting for the object.
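
As a standalone sketch of the annotation pattern (hypothetical helper
names, not this patch's code): memory from the page allocator is
invisible to kmemleak, so pointers stored in it look unreferenced
unless the block is registered explicitly, and the min_count argument
states how many references to the block itself are expected:

  #include <linux/gfp.h>
  #include <linux/kmemleak.h>
  #include <linux/mm.h>

  /* Register a page-allocator block with kmemleak so its contents
   * are scanned for pointers.  min_count = 1 means "expect at least
   * one reference to the block, otherwise report it as leaked";
   * min_count = 0 suppresses leak reports for the block itself.
   */
  static void *alloc_scanned_block(size_t size)
  {
  	void *p = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
  					   get_order(size));
  	if (p)
  		kmemleak_alloc(p, size, 1, GFP_KERNEL);
  	return p;
  }

  static void free_scanned_block(void *p, size_t size)
  {
  	kmemleak_free(p);
  	free_pages((unsigned long)p, get_order(size));
  }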
Signed-off-by: Konstantin Khlebnikov
Reported-by: Cong Wang
Fixes: 85704cb8dcfd ("net/core/neighbour: tell kmemleak about hash tables")
Signed-off-by: David S. Miller
---
 net/core/neighbour.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 3e27a779f288..96fdc9134726 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -450,7 +450,7 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
 		buckets = (struct neighbour __rcu **)
			  __get_free_pages(GFP_ATOMIC | __GFP_ZERO,
					   get_order(size));
-		kmemleak_alloc(buckets, size, 0, GFP_ATOMIC);
+		kmemleak_alloc(buckets, size, 1, GFP_ATOMIC);
 	}
 	if (!buckets) {
 		kfree(ret);
--
cgit v1.2.3


From c61c27687a5abce11431e6de1adb6e36099b9859 Mon Sep 17 00:00:00 2001
From: Mathieu Malaterre
Date: Wed, 16 Jan 2019 20:35:41 +0100
Subject: bpf: Correctly annotate implicit fall through in bpf_base_func_proto
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is a plan to build the kernel with -Wimplicit-fallthrough, and
this place in the code produced a warning (W=1).

To preserve as much of the existing comment as possible, only a ‘:’ is
changed into a ‘,’.  This change is enough to match the regular
expression expected by GCC.

This commit removes the following warning:

  net/core/filter.c:5310:6: warning: this statement may fall through [-Wimplicit-fallthrough=]

Signed-off-by: Mathieu Malaterre
Signed-off-by: Daniel Borkmann
---
 net/core/filter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index 2b3b436ef545..d9076e609fca 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5309,7 +5309,7 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_trace_printk:
 		if (capable(CAP_SYS_ADMIN))
 			return bpf_get_trace_printk_proto();
-		/* else: fall through */
+		/* else, fall through */
 	default:
 		return NULL;
 	}
--
cgit v1.2.3


From f4924f24da8c7ef64195096817f3cde324091d97 Mon Sep 17 00:00:00 2001
From: Peter Oskolkov
Date: Wed, 16 Jan 2019 08:47:54 -0800
Subject: bpf: bpf_setsockopt: reset sock dst on SO_MARK changes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In sock_setsockopt() (net/core/sock.c), when the SO_MARK option is used
to change sk_mark, sk_dst_reset(sk) is called.  The same should be done
in bpf_setsockopt().

Fixes: 8c4b4c7e9ff0 ("bpf: Add setsockopt helper function to bpf")
Reported-by: Maciej Żenczykowski
Signed-off-by: Peter Oskolkov
Acked-by: Martin KaFai Lau
Reviewed-by: Maciej Żenczykowski
Signed-off-by: Daniel Borkmann
---
 net/core/filter.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index d9076e609fca..d9ea51b47f35 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4132,7 +4132,10 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 			sk->sk_rcvlowat = val ? : 1;
 			break;
 		case SO_MARK:
-			sk->sk_mark = val;
+			if (sk->sk_mark != val) {
+				sk->sk_mark = val;
+				sk_dst_reset(sk);
+			}
 			break;
 		default:
 			ret = -EINVAL;
--
cgit v1.2.3


From e224c390a6259c529f7b2a6bd215a087b3344f5c Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Thu, 17 Jan 2019 08:51:01 -0800
Subject: bpf: fix SO_MAX_PACING_RATE to support TCP internal pacing

If the sch_fq packet scheduler is not used, TCP can fall back to
internal pacing, but this requires sk_pacing_status to be properly set.
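
For illustration, a sock_ops sketch that caps the pacing rate
(hypothetical rate and selftests-style helper header, as in the
TCP_BPF_IW example above; not part of this patch).  With this fix the
cap takes effect even without sch_fq, via TCP's internal pacing:

  #include <linux/bpf.h>
  #include "bpf_helpers.h"	/* selftests-style helper declarations */

  #ifndef SOL_SOCKET
  #define SOL_SOCKET 1
  #endif
  #ifndef SO_MAX_PACING_RATE
  #define SO_MAX_PACING_RATE 47
  #endif

  SEC("sockops")
  int cap_pacing_rate(struct bpf_sock_ops *skops)
  {
  	/* hypothetical cap: ~5 MB/s, in bytes per second */
  	unsigned int rate = 5 * 1000 * 1000;

  	if (skops->op == BPF_SOCK_OPS_TCP_CONNECT_CB)
  		bpf_setsockopt(skops, SOL_SOCKET, SO_MAX_PACING_RATE,
  			       &rate, sizeof(rate));
  	return 1;
  }

  char _license[] SEC("license") = "GPL";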
Fixes: 8c4b4c7e9ff0 ("bpf: Add setsockopt helper function to bpf")
Signed-off-by: Yuchung Cheng
Signed-off-by: Eric Dumazet
Cc: Lawrence Brakmo
Acked-by: Martin KaFai Lau
Signed-off-by: Daniel Borkmann
---
 net/core/filter.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index d9ea51b47f35..dab10d21cae8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4119,6 +4119,10 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 			sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 			break;
 		case SO_MAX_PACING_RATE: /* 32bit version */
+			if (val != ~0U)
+				cmpxchg(&sk->sk_pacing_status,
+					SK_PACING_NONE,
+					SK_PACING_NEEDED);
 			sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val;
 			sk->sk_pacing_rate = min(sk->sk_pacing_rate,
						 sk->sk_max_pacing_rate);
--
cgit v1.2.3


From 87fff3cacd0112bcaf42f932c1e44ae32b42f1fb Mon Sep 17 00:00:00 2001
From: Yang Wei
Date: Thu, 17 Jan 2019 23:11:30 +0800
Subject: neighbour: Do not perturb drop profiles when neigh_probe

Replace kfree_skb() with consume_skb() to be drop-monitor (dropwatch,
perf) friendly.

Signed-off-by: Yang Wei
Signed-off-by: David S. Miller
---
 net/core/neighbour.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 96fdc9134726..4230400b9a30 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1007,7 +1007,7 @@ static void neigh_probe(struct neighbour *neigh)
 	if (neigh->ops->solicit)
 		neigh->ops->solicit(neigh, skb);
 	atomic_inc(&neigh->probes);
-	kfree_skb(skb);
+	consume_skb(skb);
 }
 
 /* Called when a timer expires for a neighbour entry. */
--
cgit v1.2.3


From e7c87bd6cc4ec7b0ac1ed0a88a58f8206c577488 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Tue, 15 Jan 2019 20:19:22 -0500
Subject: bpf: in __bpf_redirect_no_mac pull mac only if present

Syzkaller was able to construct a packet of negative length by
redirecting from bpf_prog_test_run_skb with BPF_PROG_TYPE_LWT_XMIT:

BUG: KASAN: slab-out-of-bounds in memcpy include/linux/string.h:345 [inline]
BUG: KASAN: slab-out-of-bounds in skb_copy_from_linear_data include/linux/skbuff.h:3421 [inline]
BUG: KASAN: slab-out-of-bounds in __pskb_copy_fclone+0x2dd/0xeb0 net/core/skbuff.c:1395
Read of size 4294967282 at addr ffff8801d798009c by task syz-executor2/12942

 kasan_report.cold.9+0x242/0x309 mm/kasan/report.c:412
 check_memory_region_inline mm/kasan/kasan.c:260 [inline]
 check_memory_region+0x13e/0x1b0 mm/kasan/kasan.c:267
 memcpy+0x23/0x50 mm/kasan/kasan.c:302
 memcpy include/linux/string.h:345 [inline]
 skb_copy_from_linear_data include/linux/skbuff.h:3421 [inline]
 __pskb_copy_fclone+0x2dd/0xeb0 net/core/skbuff.c:1395
 __pskb_copy include/linux/skbuff.h:1053 [inline]
 pskb_copy include/linux/skbuff.h:2904 [inline]
 skb_realloc_headroom+0xe7/0x120 net/core/skbuff.c:1539
 ipip6_tunnel_xmit net/ipv6/sit.c:965 [inline]
 sit_tunnel_xmit+0xe1b/0x30d0 net/ipv6/sit.c:1029
 __netdev_start_xmit include/linux/netdevice.h:4325 [inline]
 netdev_start_xmit include/linux/netdevice.h:4334 [inline]
 xmit_one net/core/dev.c:3219 [inline]
 dev_hard_start_xmit+0x295/0xc90 net/core/dev.c:3235
 __dev_queue_xmit+0x2f0d/0x3950 net/core/dev.c:3805
 dev_queue_xmit+0x17/0x20 net/core/dev.c:3838
 __bpf_tx_skb net/core/filter.c:2016 [inline]
 __bpf_redirect_common net/core/filter.c:2054 [inline]
 __bpf_redirect+0x5cf/0xb20 net/core/filter.c:2061
 ____bpf_clone_redirect net/core/filter.c:2094 [inline]
 bpf_clone_redirect+0x2f6/0x490 net/core/filter.c:2066
 bpf_prog_41f2bcae09cd4ac3+0xb25/0x1000

The generated test constructs a packet with mac header, network header,
skb->data pointing to the network header, and skb->len 0.

Redirecting to a sit0 through __bpf_redirect_no_mac pulls the mac
length, even though skb->data already is at skb->network_header.
bpf_prog_test_run_skb has already pulled it as LWT_XMIT !is_l2.

Update the offset calculation to pull only if skb->data differs from
skb->network_header, which is not true in this case.

The test itself can be run only from commit 1cf1cae963c2 ("bpf:
introduce BPF_PROG_TEST_RUN command"), but the same type of packets
with skb at the network header could already be built from lwt xmit
hooks, so this fix is more relevant to that commit.

Also set the mac header on redirect from LWT_XMIT, as even after this
change to __bpf_redirect_no_mac that field is expected to be set, but
is not yet in ip_finish_output2.

Fixes: 3a0af8fd61f9 ("bpf: BPF for lightweight tunnel infrastructure")
Reported-by: syzbot
Signed-off-by: Willem de Bruijn
Acked-by: Martin KaFai Lau
Signed-off-by: Daniel Borkmann
---
 net/core/filter.c  | 21 +++++++++++----------
 net/core/lwt_bpf.c |  1 +
 2 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index dab10d21cae8..7559d6835ecb 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2020,18 +2020,19 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
 static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
				 u32 flags)
 {
-	/* skb->mac_len is not set on normal egress */
-	unsigned int mlen = skb->network_header - skb->mac_header;
+	unsigned int mlen = skb_network_offset(skb);
 
-	__skb_pull(skb, mlen);
+	if (mlen) {
+		__skb_pull(skb, mlen);
 
-	/* At ingress, the mac header has already been pulled once.
-	 * At egress, skb_pospull_rcsum has to be done in case that
-	 * the skb is originated from ingress (i.e. a forwarded skb)
-	 * to ensure that rcsum starts at net header.
-	 */
-	if (!skb_at_tc_ingress(skb))
-		skb_postpull_rcsum(skb, skb_mac_header(skb), mlen);
+		/* At ingress, the mac header has already been pulled once.
+		 * At egress, skb_pospull_rcsum has to be done in case that
+		 * the skb is originated from ingress (i.e. a forwarded skb)
+		 * to ensure that rcsum starts at net header.
+		 */
+		if (!skb_at_tc_ingress(skb))
+			skb_postpull_rcsum(skb, skb_mac_header(skb), mlen);
+	}
 	skb_pop_mac_header(skb);
 	skb_reset_mac_len(skb);
 	return flags & BPF_F_INGRESS ?
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 3e85437f7106..a648568c5e8f 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -63,6 +63,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
			lwt->name ? : "");
		ret = BPF_OK;
	} else {
+		skb_reset_mac_header(skb);
		ret = skb_do_redirect(skb);
		if (ret == 0)
			ret = BPF_REDIRECT;
--
cgit v1.2.3


From c9e4576743eeda8d24dedc164d65b78877f9a98c Mon Sep 17 00:00:00 2001
From: Yafang Shao
Date: Wed, 23 Jan 2019 12:37:19 +0800
Subject: bpf: sock recvbuff must be limited by rmem_max in bpf_setsockopt()

When the sock recvbuff is set by bpf_setsockopt(), the value must be
limited by rmem_max.  The same applies to the sendbuff.
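
As a userspace illustration of the clamping semantics (a standalone
sketch, not part of the patch): plain setsockopt() has always clamped
SO_RCVBUF to net.core.rmem_max in sock_setsockopt(), with the kernel
doubling the value for bookkeeping overhead, and after this change
bpf_setsockopt() behaves the same way:

  #include <stdio.h>
  #include <sys/socket.h>

  int main(void)
  {
  	int fd = socket(AF_INET, SOCK_STREAM, 0);
  	int requested = 64 * 1024 * 1024;	/* likely above rmem_max */
  	int effective;
  	socklen_t len = sizeof(effective);

  	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &requested, sizeof(requested));
  	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &effective, &len);
  	printf("requested %d, effective %d\n", requested, effective);
  	return 0;
  }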
Fixes: 8c4b4c7e9ff0 ("bpf: Add setsockopt helper function to bpf")
Signed-off-by: Yafang Shao
Acked-by: Martin KaFai Lau
Acked-by: Lawrence Brakmo
Signed-off-by: Daniel Borkmann
---
 net/core/filter.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index 7559d6835ecb..7a54dc11ac2d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4112,10 +4112,12 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 		/* Only some socketops are supported */
 		switch (optname) {
 		case SO_RCVBUF:
+			val = min_t(u32, val, sysctl_rmem_max);
 			sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 			sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 			break;
 		case SO_SNDBUF:
+			val = min_t(u32, val, sysctl_wmem_max);
 			sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 			sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 			break;
--
cgit v1.2.3


From 1d79895aef18fa05789995d86d523c9b2ee58a02 Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki
Date: Mon, 28 Jan 2019 10:13:35 +0100
Subject: sk_msg: Always cancel strp work before freeing the psock

Despite having stopped the parser, we still need to deinitialize it by
calling strp_done so that it cancels its work.  Otherwise the worker
thread can run after we have freed the parser, and attempt to access
its workqueue, resulting in a use-after-free:

==================================================================
BUG: KASAN: use-after-free in pwq_activate_delayed_work+0x1b/0x1d0
Read of size 8 at addr ffff888069975240 by task kworker/u2:2/93

CPU: 0 PID: 93 Comm: kworker/u2:2 Not tainted 5.0.0-rc2-00335-g28f9d1a3d4fe-dirty #14
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-2.fc27 04/01/2014
Workqueue: (null) (kstrp)
Call Trace:
 print_address_description+0x6e/0x2b0
 ? pwq_activate_delayed_work+0x1b/0x1d0
 kasan_report+0xfd/0x177
 ? pwq_activate_delayed_work+0x1b/0x1d0
 ? pwq_activate_delayed_work+0x1b/0x1d0
 pwq_activate_delayed_work+0x1b/0x1d0
 ? process_one_work+0x4aa/0x660
 pwq_dec_nr_in_flight+0x9b/0x100
 worker_thread+0x82/0x680
 ? process_one_work+0x660/0x660
 kthread+0x1b9/0x1e0
 ? __kthread_create_on_node+0x250/0x250
 ret_from_fork+0x1f/0x30

Allocated by task 111:
 sk_psock_init+0x3c/0x1b0
 sock_map_link.isra.2+0x103/0x4b0
 sock_map_update_common+0x94/0x270
 sock_map_update_elem+0x145/0x160
 __se_sys_bpf+0x152e/0x1e10
 do_syscall_64+0xb2/0x3e0
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Freed by task 112:
 kfree+0x7f/0x140
 process_one_work+0x40b/0x660
 worker_thread+0x82/0x680
 kthread+0x1b9/0x1e0
 ret_from_fork+0x1f/0x30

The buggy address belongs to the object at ffff888069975180
 which belongs to the cache kmalloc-512 of size 512
The buggy address is located 192 bytes inside of
 512-byte region [ffff888069975180, ffff888069975380)
The buggy address belongs to the page:
page:ffffea0001a65d00 count:1 mapcount:0 mapping:ffff88806d401280 index:0x0 compound_mapcount: 0
flags: 0x4000000000010200(slab|head)
raw: 4000000000010200 dead000000000100 dead000000000200 ffff88806d401280
raw: 0000000000000000 00000000800c000c 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff888069975100: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
 ffff888069975180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>ffff888069975200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                                           ^
 ffff888069975280: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff888069975300: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
==================================================================

Reported-by: Marek Majkowski
Signed-off-by: Jakub Sitnicki
Link: https://lore.kernel.org/netdev/CAJPywTLwgXNEZ2dZVoa=udiZmtrWJ0q5SuBW64aYs0Y1khXX3A@mail.gmail.com
Acked-by: Song Liu
Signed-off-by: Daniel Borkmann
---
 net/core/skmsg.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index d6d5c20d7044..8c826603bf36 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -545,8 +545,7 @@ static void sk_psock_destroy_deferred(struct work_struct *gc)
 	struct sk_psock *psock = container_of(gc, struct sk_psock, gc);
 
 	/* No sk_callback_lock since already detached. */
-	if (psock->parser.enabled)
-		strp_done(&psock->parser.strp);
+	strp_done(&psock->parser.strp);
 
 	cancel_work_sync(&psock->work);
--
cgit v1.2.3


From 35edfdc77f683c8fd27d7732af06cf6489af60a5 Mon Sep 17 00:00:00 2001
From: Josh Elsasser
Date: Sat, 26 Jan 2019 14:38:33 -0800
Subject: net: set default network namespace in init_dummy_netdev()

Assign a default net namespace to netdevs created by
init_dummy_netdev().  This fixes a NULL pointer dereference caused by
busy-polling a socket bound to an iwlwifi wireless device, which bumps
the per-net BUSYPOLLRXPACKETS stat if napi_poll() received packets:

  BUG: unable to handle kernel NULL pointer dereference at 0000000000000190
  IP: napi_busy_loop+0xd6/0x200
  Call Trace:
   sock_poll+0x5e/0x80
   do_sys_poll+0x324/0x5a0
   SyS_poll+0x6c/0xf0
   do_syscall_64+0x6b/0x1f0
   entry_SYSCALL_64_after_hwframe+0x3d/0xa2

Fixes: 7db6b048da3b ("net: Commonize busy polling code to focus on napi_id instead of socket")
Signed-off-by: Josh Elsasser
Signed-off-by: David S. Miller
---
 net/core/dev.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 82f20022259d..8e276e0192a1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8712,6 +8712,9 @@ int init_dummy_netdev(struct net_device *dev)
 	set_bit(__LINK_STATE_PRESENT, &dev->state);
 	set_bit(__LINK_STATE_START, &dev->state);
 
+	/* napi_busy_loop stats accounting wants this */
+	dev_net_set(dev, &init_net);
+
 	/* Note : We dont allocate pcpu_refcnt for dummy devices,
 	 * because users of this 'device' dont need to change
 	 * its refcount.
--
cgit v1.2.3


From 5bf325a53202b8728cf7013b72688c46071e212e Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 12 Feb 2019 12:26:27 -0800
Subject: net: fix possible overflow in __sk_mem_raise_allocated()

With many active TCP sockets, fat TCP sockets could fool
__sk_mem_raise_allocated() thanks to an overflow.  They would increase
their share of the memory, instead of decreasing it.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/sock.h | 2 +-
 net/core/sock.c    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/include/net/sock.h b/include/net/sock.h
index 2b229f7be8eb..f43f935cb113 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1277,7 +1277,7 @@ static inline void sk_sockets_allocated_inc(struct sock *sk)
 	percpu_counter_inc(sk->sk_prot->sockets_allocated);
 }
 
-static inline int
+static inline u64
 sk_sockets_allocated_read_positive(struct sock *sk)
 {
 	return percpu_counter_read_positive(sk->sk_prot->sockets_allocated);
diff --git a/net/core/sock.c b/net/core/sock.c
index 6aa2e7e0b4fb..bc3512f230a3 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2380,7 +2380,7 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
 	}
 
 	if (sk_has_memory_pressure(sk)) {
-		int alloc;
+		u64 alloc;
 
 		if (!sk_under_memory_pressure(sk))
 			return 1;
--
cgit v1.2.3
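
To see the failure mode of the pre-fix code, here is a standalone
sketch with made-up numbers (not kernel code): the 32-bit product of
the socket count and the requested pages wraps negative, so a
"limit > product" comparison wrongly succeeds; widening to 64 bits,
as this patch does, keeps the comparison honest:

  #include <stdio.h>

  int main(void)
  {
  	int alloc = 100000;           /* many active sockets (hypothetical) */
  	int pages = 30000;            /* pages wanted by one fat socket     */
  	long long limit = 3000000;    /* per-protocol limit, in pages       */

  	/* 32-bit product wraps: 3e9 does not fit in an int */
  	int wrapped = (int)((unsigned int)alloc * (unsigned int)pages);
  	long long exact = (long long)alloc * pages;

  	printf("32-bit product: %d\n", wrapped);   /* negative */
  	printf("64-bit product: %lld\n", exact);
  	printf("limit > 32-bit product? %s\n",
  	       limit > wrapped ? "yes (check wrongly passes)" : "no");
  	printf("limit > 64-bit product? %s\n",
  	       limit > exact ? "yes" : "no (check correctly fails)");
  	return 0;
  }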