From 435ccfa894e35e3d4a1799e6ac030e48a7b69ef5 Mon Sep 17 00:00:00 2001
From: Arjun Roy <arjunroy@google.com>
Date: Fri, 23 Oct 2020 11:47:09 -0700
Subject: tcp: Prevent low rmem stalls with SO_RCVLOWAT.

With SO_RCVLOWAT, under memory pressure,
it is possible to enter a state where:

1. We have not received enough bytes to satisfy SO_RCVLOWAT.
2. We have not entered buffer pressure (see tcp_rmem_pressure()).
3. But, we do not have enough buffer space to accept more packets.

In this case, we advertise 0 rwnd (due to #3) but the application does
not drain the receive queue (no wakeup because of #1 and #2) so the
flow stalls.

Modify the heuristic for SO_RCVLOWAT so that, if we are advertising
rwnd<=rcv_mss, force a wakeup to prevent a stall.

Without this patch, setting tcp_rmem to 6143 and disabling TCP
autotune causes a stalled flow. With this patch, no stall occurs. This
is with RPC-style traffic with large messages.

Fixes: 03f45c883c6f ("tcp: avoid extra wakeups for SO_RCVLOWAT users")
Signed-off-by: Arjun Roy <arjunroy@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20201023184709.217614-1-arjunroy.kdev@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/ipv4/tcp.c       | 2 ++
 net/ipv4/tcp_input.c | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index bae4284bf542..b2bc3d7fe9e8 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -485,6 +485,8 @@ static inline bool tcp_stream_is_readable(const struct tcp_sock *tp,
 			return true;
 		if (tcp_rmem_pressure(sk))
 			return true;
+		if (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss)
+			return true;
 	}
 	if (sk->sk_prot->stream_memory_read)
 		return sk->sk_prot->stream_memory_read(sk);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index fc445833b5e5..389d1b340248 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4908,7 +4908,8 @@ void tcp_data_ready(struct sock *sk)
 	int avail = tp->rcv_nxt - tp->copied_seq;
 
 	if (avail < sk->sk_rcvlowat && !tcp_rmem_pressure(sk) &&
-	    !sock_flag(sk, SOCK_DONE))
+	    !sock_flag(sk, SOCK_DONE) &&
+	    tcp_receive_window(tp) > inet_csk(sk)->icsk_ack.rcv_mss)
 		return;
 
 	sk->sk_data_ready(sk);
-- 
cgit v1.2.3


From af545bb5ee53f5261db631db2ac4cde54038bdaf Mon Sep 17 00:00:00 2001
From: Jeff Vander Stoep <jeffv@google.com>
Date: Fri, 23 Oct 2020 16:37:57 +0200
Subject: vsock: use ns_capable_noaudit() on socket create

During __vsock_create() CAP_NET_ADMIN is used to determine if the
vsock_sock->trusted should be set to true. This value is used later
for determing if a remote connection should be allowed to connect
to a restricted VM. Unfortunately, if the caller doesn't have
CAP_NET_ADMIN, an audit message such as an selinux denial is
generated even if the caller does not want a trusted socket.

Logging errors on success is confusing. To avoid this, switch the
capable(CAP_NET_ADMIN) check to the noaudit version.

Reported-by: Roman Kiryanov <rkir@google.com>
https://android-review.googlesource.com/c/device/generic/goldfish/+/1468545/
Signed-off-by: Jeff Vander Stoep <jeffv@google.com>
Reviewed-by: James Morris <jamorris@linux.microsoft.com>
Link: https://lore.kernel.org/r/20201023143757.377574-1-jeffv@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/vmw_vsock/af_vsock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 9e93bc201cc0..b4d7b8aba003 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -739,7 +739,7 @@ static struct sock *__vsock_create(struct net *net,
 		vsk->buffer_min_size = psk->buffer_min_size;
 		vsk->buffer_max_size = psk->buffer_max_size;
 	} else {
-		vsk->trusted = capable(CAP_NET_ADMIN);
+		vsk->trusted = ns_capable_noaudit(&init_user_ns, CAP_NET_ADMIN);
 		vsk->owner = get_current_cred();
 		vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT;
 		vsk->buffer_size = VSOCK_DEFAULT_BUFFER_SIZE;
-- 
cgit v1.2.3


From 4a9baf45fd72615a804947a8495b73c4a0a4cb54 Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Fri, 23 Oct 2020 20:48:28 +0200
Subject: net/smc: fix null pointer dereference in smc_listen_decline()

smc_listen_work() calls smc_listen_decline() on label out_decl,
providing the ini pointer variable. But this pointer can still be null
when the label out_decl is reached.
Fix this by checking the ini variable in smc_listen_work() and call
smc_listen_decline() with the result directly.

Fixes: a7c9c5f4af7f ("net/smc: CLC accept / confirm V2")
Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/smc/af_smc.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 82be0bd0f6e8..e9f487c8c6d5 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1317,10 +1317,10 @@ static void smc_listen_out_err(struct smc_sock *new_smc)
 
 /* listen worker: decline and fall back if possible */
 static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
-			       struct smc_init_info *ini, u8 version)
+			       int local_first, u8 version)
 {
 	/* RDMA setup failed, switch back to TCP */
-	if (ini->first_contact_local)
+	if (local_first)
 		smc_lgr_cleanup_early(&new_smc->conn);
 	else
 		smc_conn_free(&new_smc->conn);
@@ -1768,7 +1768,8 @@ static void smc_listen_work(struct work_struct *work)
 out_unlock:
 	mutex_unlock(&smc_server_lgr_pending);
 out_decl:
-	smc_listen_decline(new_smc, rc, ini, version);
+	smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0,
+			   version);
 out_free:
 	kfree(ini);
 	kfree(buf);
-- 
cgit v1.2.3


From 96d6fded958d971a3695009e0ed43aca6c598283 Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Fri, 23 Oct 2020 20:48:29 +0200
Subject: net/smc: fix suppressed return code

The patch that repaired the invalid return code in smcd_new_buf_create()
missed to take care of errno ENOSPC which has a special meaning that no
more DMBEs can be registered on the device. Fix that by keeping this
errno value during the translation of the return code.

Fixes: 6b1bbf94ab36 ("net/smc: fix invalid return code in smcd_new_buf_create()")
Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/smc/smc_core.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index d790c43c473f..2b19863f7171 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -1615,8 +1615,11 @@ static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr,
 		rc = smc_ism_register_dmb(lgr, bufsize, buf_desc);
 		if (rc) {
 			kfree(buf_desc);
-			return (rc == -ENOMEM) ? ERR_PTR(-EAGAIN) :
-						 ERR_PTR(-EIO);
+			if (rc == -ENOMEM)
+				return ERR_PTR(-EAGAIN);
+			if (rc == -ENOSPC)
+				return ERR_PTR(-ENOSPC);
+			return ERR_PTR(-EIO);
 		}
 		buf_desc->pages = virt_to_page(buf_desc->cpu_addr);
 		/* CDC header stored in buf. So, pretend it was smaller */
-- 
cgit v1.2.3


From 6c211809c87f0de939f3bd60ceec05338ae6eba1 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Mon, 26 Oct 2020 11:00:59 +0300
Subject: devlink: Fix some error codes

These paths don't set the error codes.  It's especially important in
devlink_nl_region_notify_build() where it leads to a NULL dereference in
the caller.

Fixes: 544e7c33ec2f ("net: devlink: Add support for port regions")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Link: https://lore.kernel.org/r/20201026080059.GA1628785@mwanda
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/core/devlink.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/core/devlink.c b/net/core/devlink.c
index a578634052a3..925da7c0fcf3 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -4213,10 +4213,12 @@ static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink,
 	if (err)
 		goto nla_put_failure;
 
-	if (region->port)
-		if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX,
-				region->port->index))
+	if (region->port) {
+		err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX,
+				  region->port->index);
+		if (err)
 			goto nla_put_failure;
+	}
 
 	err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->ops->name);
 	if (err)
@@ -4265,10 +4267,12 @@ devlink_nl_region_notify_build(struct devlink_region *region,
 	if (err)
 		goto out_cancel_msg;
 
-	if (region->port)
-		if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX,
-				region->port->index))
+	if (region->port) {
+		err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX,
+				  region->port->index);
+		if (err)
 			goto out_cancel_msg;
+	}
 
 	err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME,
 			     region->ops->name);
@@ -4962,10 +4966,12 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
 	if (err)
 		goto nla_put_failure;
 
-	if (region->port)
-		if (nla_put_u32(skb, DEVLINK_ATTR_PORT_INDEX,
-				region->port->index))
+	if (region->port) {
+		err = nla_put_u32(skb, DEVLINK_ATTR_PORT_INDEX,
+				  region->port->index);
+		if (err)
 			goto nla_put_failure;
+	}
 
 	err = nla_put_string(skb, DEVLINK_ATTR_REGION_NAME, region_name);
 	if (err)
-- 
cgit v1.2.3


From 0d8cb9464a7d9c3e6349db3f7719a80f3793347e Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Mon, 26 Oct 2020 11:01:27 +0300
Subject: devlink: Unlock on error in dumpit()

This needs to unlock before returning.

Fixes: 544e7c33ec2f ("net: devlink: Add support for port regions")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Link: https://lore.kernel.org/r/20201026080127.GB1628785@mwanda
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/core/devlink.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/devlink.c b/net/core/devlink.c
index 925da7c0fcf3..a932d95be798 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -4919,8 +4919,10 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
 		index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
 
 		port = devlink_port_get_by_index(devlink, index);
-		if (!port)
-			return -ENODEV;
+		if (!port) {
+			err = -ENODEV;
+			goto out_unlock;
+		}
 	}
 
 	region_name = nla_data(attrs[DEVLINK_ATTR_REGION_NAME]);
-- 
cgit v1.2.3


From 501b72ae2472a15a80c0f4063ee8341870e1ef55 Mon Sep 17 00:00:00 2001
From: Guillaume Nault <gnault@redhat.com>
Date: Mon, 26 Oct 2020 11:29:45 +0100
Subject: net/sched: act_mpls: Add softdep on mpls_gso.ko

TCA_MPLS_ACT_PUSH and TCA_MPLS_ACT_MAC_PUSH might be used on gso
packets. Such packets will thus require mpls_gso.ko for segmentation.

v2: Drop dependency on CONFIG_NET_MPLS_GSO in Kconfig (from Jakub and
    David).

Fixes: 2a2ea50870ba ("net: sched: add mpls manipulation actions to TC")
Signed-off-by: Guillaume Nault <gnault@redhat.com>
Link: https://lore.kernel.org/r/1f6cab15bbd15666795061c55563aaf6a386e90e.1603708007.git.gnault@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/sched/act_mpls.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c
index f40bf9771cb9..5c7456e5b5cf 100644
--- a/net/sched/act_mpls.c
+++ b/net/sched/act_mpls.c
@@ -426,6 +426,7 @@ static void __exit mpls_cleanup_module(void)
 module_init(mpls_init_module);
 module_exit(mpls_cleanup_module);
 
+MODULE_SOFTDEP("post: mpls_gso");
 MODULE_AUTHOR("Netronome Systems <oss-drivers@netronome.com>");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("MPLS manipulation actions");
-- 
cgit v1.2.3


From d6535dca28859d8d9ef80894eb287b2ac35a32e8 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Mon, 26 Oct 2020 14:33:27 +0200
Subject: net: protect tcf_block_unbind with block lock

The tcf_block_unbind() expects that the caller will take block->cb_lock
before calling it, however the code took RTNL lock and dropped cb_lock
instead. This causes to the following kernel panic.

 WARNING: CPU: 1 PID: 13524 at net/sched/cls_api.c:1488 tcf_block_unbind+0x2db/0x420
 Modules linked in: mlx5_ib mlx5_core mlxfw ptp pps_core act_mirred act_tunnel_key cls_flower vxlan ip6_udp_tunnel udp_tunnel dummy sch_ingress openvswitch nsh xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 br_netfilter rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi ib_umad ib_ipoib rdma_cm iw_cm ib_cm ib_uverbs ib_core overlay [last unloaded: mlxfw]
 CPU: 1 PID: 13524 Comm: test-ecmp-add-v Tainted: G        W         5.9.0+ #1
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
 RIP: 0010:tcf_block_unbind+0x2db/0x420
 Code: ff 48 83 c4 40 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8d bc 24 30 01 00 00 be ff ff ff ff e8 7d 7f 70 00 85 c0 0f 85 7b fd ff ff <0f> 0b e9 74 fd ff ff 48 c7 c7 dc 6a 24 84 e8 02 ec fe fe e9 55 fd
 RSP: 0018:ffff888117d17968 EFLAGS: 00010246
 RAX: 0000000000000000 RBX: ffff88812f713c00 RCX: 1ffffffff0848d5b
 RDX: 0000000000000001 RSI: ffff88814fbc8130 RDI: ffff888107f2b878
 RBP: 1ffff11022fa2f3f R08: 0000000000000000 R09: ffffffff84115a87
 R10: fffffbfff0822b50 R11: ffff888107f2b898 R12: ffff88814fbc8000
 R13: ffff88812f713c10 R14: ffff888117d17a38 R15: ffff88814fbc80c0
 FS:  00007f6593d36740(0000) GS:ffff8882a4f00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 00005607a00758f8 CR3: 0000000131aea006 CR4: 0000000000170ea0
 Call Trace:
  tc_block_indr_cleanup+0x3e0/0x5a0
  ? tcf_block_unbind+0x420/0x420
  ? __mutex_unlock_slowpath+0xe7/0x610
  flow_indr_dev_unregister+0x5e2/0x930
  ? mlx5e_restore_tunnel+0xdf0/0xdf0 [mlx5_core]
  ? mlx5e_restore_tunnel+0xdf0/0xdf0 [mlx5_core]
  ? flow_indr_block_cb_alloc+0x3c0/0x3c0
  ? mlx5_db_free+0x37c/0x4b0 [mlx5_core]
  mlx5e_cleanup_rep_tx+0x8b/0xc0 [mlx5_core]
  mlx5e_detach_netdev+0xe5/0x120 [mlx5_core]
  mlx5e_vport_rep_unload+0x155/0x260 [mlx5_core]
  esw_offloads_disable+0x227/0x2b0 [mlx5_core]
  mlx5_eswitch_disable_locked.cold+0x38e/0x699 [mlx5_core]
  mlx5_eswitch_disable+0x94/0xf0 [mlx5_core]
  mlx5_device_disable_sriov+0x183/0x1f0 [mlx5_core]
  mlx5_core_sriov_configure+0xfd/0x230 [mlx5_core]
  sriov_numvfs_store+0x261/0x2f0
  ? sriov_drivers_autoprobe_store+0x110/0x110
  ? sysfs_file_ops+0x170/0x170
  ? sysfs_file_ops+0x117/0x170
  ? sysfs_file_ops+0x170/0x170
  kernfs_fop_write+0x1ff/0x3f0
  ? rcu_read_lock_any_held+0x6e/0x90
  vfs_write+0x1f3/0x620
  ksys_write+0xf9/0x1d0
  ? __x64_sys_read+0xb0/0xb0
  ? lockdep_hardirqs_on_prepare+0x273/0x3f0
  ? syscall_enter_from_user_mode+0x1d/0x50
  do_syscall_64+0x2d/0x40
  entry_SYSCALL_64_after_hwframe+0x44/0xa9

<...>

 ---[ end trace bfdd028ada702879 ]---

Fixes: 0fdcf78d5973 ("net: use flow_indr_dev_setup_offload()")
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Link: https://lore.kernel.org/r/20201026123327.1141066-1-leon@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/sched/cls_api.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index faeabff283a2..838b3fd94d77 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -652,12 +652,12 @@ static void tc_block_indr_cleanup(struct flow_block_cb *block_cb)
 			       block_cb->indr.binder_type,
 			       &block->flow_block, tcf_block_shared(block),
 			       &extack);
+	rtnl_lock();
 	down_write(&block->cb_lock);
 	list_del(&block_cb->driver_list);
 	list_move(&block_cb->list, &bo.cb_list);
-	up_write(&block->cb_lock);
-	rtnl_lock();
 	tcf_block_unbind(block, &bo);
+	up_write(&block->cb_lock);
 	rtnl_unlock();
 }
 
-- 
cgit v1.2.3


From ceb1eb2fb609c88363e06618b8d4bbf7815a4e03 Mon Sep 17 00:00:00 2001
From: Tung Nguyen <tung.q.nguyen@dektech.com.au>
Date: Tue, 27 Oct 2020 10:24:03 +0700
Subject: tipc: fix memory leak caused by tipc_buf_append()

Commit ed42989eab57 ("tipc: fix the skb_unshare() in tipc_buf_append()")
replaced skb_unshare() with skb_copy() to not reduce the data reference
counter of the original skb intentionally. This is not the correct
way to handle the cloned skb because it causes memory leak in 2
following cases:
 1/ Sending multicast messages via broadcast link
  The original skb list is cloned to the local skb list for local
  destination. After that, the data reference counter of each skb
  in the original list has the value of 2. This causes each skb not
  to be freed after receiving ACK:
  tipc_link_advance_transmq()
  {
   ...
   /* release skb */
   __skb_unlink(skb, &l->transmq);
   kfree_skb(skb); <-- memory exists after being freed
  }

 2/ Sending multicast messages via replicast link
  Similar to the above case, each skb cannot be freed after purging
  the skb list:
  tipc_mcast_xmit()
  {
   ...
   __skb_queue_purge(pkts); <-- memory exists after being freed
  }

This commit fixes this issue by using skb_unshare() instead. Besides,
to avoid use-after-free error reported by KASAN, the pointer to the
fragment is set to NULL before calling skb_unshare() to make sure that
the original skb is not freed after freeing the fragment 2 times in
case skb_unshare() returns NULL.

Fixes: ed42989eab57 ("tipc: fix the skb_unshare() in tipc_buf_append()")
Acked-by: Jon Maloy <jmaloy@redhat.com>
Reported-by: Thang Hoang Ngo <thang.h.ngo@dektech.com.au>
Signed-off-by: Tung Nguyen <tung.q.nguyen@dektech.com.au>
Reviewed-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Link: https://lore.kernel.org/r/20201027032403.1823-1-tung.q.nguyen@dektech.com.au
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/tipc/msg.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index 2a78aa701572..32c79c59052b 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -150,12 +150,11 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf)
 	if (fragid == FIRST_FRAGMENT) {
 		if (unlikely(head))
 			goto err;
-		if (skb_cloned(frag))
-			frag = skb_copy(frag, GFP_ATOMIC);
+		*buf = NULL;
+		frag = skb_unshare(frag, GFP_ATOMIC);
 		if (unlikely(!frag))
 			goto err;
 		head = *headbuf = frag;
-		*buf = NULL;
 		TIPC_SKB_CB(head)->tail = NULL;
 		if (skb_is_nonlinear(head)) {
 			skb_walk_frags(head, tail) {
-- 
cgit v1.2.3


From 9c3f94e1681bb0ebd93390f014082042d8bc067a Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 27 Oct 2020 15:59:14 +0100
Subject: mptcp: add missing memory scheduling in the rx path

When moving the skbs from the subflow into the msk receive
queue, we must schedule there the required amount of memory.

Try to borrow the required memory from the subflow, if needed,
so that we leverage the existing TCP heuristic.

Fixes: 6771bfd9ee24 ("mptcp: update mptcp ack sequence from work queue")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Link: https://lore.kernel.org/r/f6143a6193a083574f11b00dbf7b5ad151bc4ff4.1603810630.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/mptcp/protocol.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'net')

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 185dacb39781..e7419fd15d84 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -274,6 +274,15 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
 	skb_ext_reset(skb);
 	skb_orphan(skb);
 
+	/* try to fetch required memory from subflow */
+	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
+		if (ssk->sk_forward_alloc < skb->truesize)
+			goto drop;
+		__sk_mem_reclaim(ssk, skb->truesize);
+		if (!sk_rmem_schedule(sk, skb, skb->truesize))
+			goto drop;
+	}
+
 	/* the skb map_seq accounts for the skb offset:
 	 * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq
 	 * value
@@ -301,6 +310,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
 	 * will retransmit as needed, if needed.
 	 */
 	MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+drop:
 	mptcp_drop(sk, skb);
 	return false;
 }
-- 
cgit v1.2.3


From eadd1befdd778a1eca57fad058782bd22b4db804 Mon Sep 17 00:00:00 2001
From: Aleksandr Nogikh <nogikh@google.com>
Date: Wed, 28 Oct 2020 17:07:31 +0000
Subject: netem: fix zero division in tabledist

Currently it is possible to craft a special netlink RTM_NEWQDISC
command that can result in jitter being equal to 0x80000000. It is
enough to set the 32 bit jitter to 0x02000000 (it will later be
multiplied by 2^6) or just set the 64 bit jitter via
TCA_NETEM_JITTER64. This causes an overflow during the generation of
uniformly distributed numbers in tabledist(), which in turn leads to
division by zero (sigma != 0, but sigma * 2 is 0).

The related fragment of code needs 32-bit division - see commit
9b0ed89 ("netem: remove unnecessary 64 bit modulus"), so switching to
64 bit is not an option.

Fix the issue by keeping the value of jitter within the range that can
be adequately handled by tabledist() - [0;INT_MAX]. As negative std
deviation makes no sense, take the absolute value of the passed value
and cap it at INT_MAX. Inside tabledist(), switch to unsigned 32 bit
arithmetic in order to prevent overflows.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Aleksandr Nogikh <nogikh@google.com>
Reported-by: syzbot+ec762a6342ad0d3c0d8f@syzkaller.appspotmail.com
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
Link: https://lore.kernel.org/r/20201028170731.1383332-1-aleksandrnogikh@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/sched/sch_netem.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 84f82771cdf5..0c345e43a09a 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -330,7 +330,7 @@ static s64 tabledist(s64 mu, s32 sigma,
 
 	/* default uniform distribution */
 	if (dist == NULL)
-		return ((rnd % (2 * sigma)) + mu) - sigma;
+		return ((rnd % (2 * (u32)sigma)) + mu) - sigma;
 
 	t = dist->table[rnd % dist->size];
 	x = (sigma % NETEM_DIST_SCALE) * t;
@@ -812,6 +812,10 @@ static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
 		q->slot_config.max_packets = INT_MAX;
 	if (q->slot_config.max_bytes == 0)
 		q->slot_config.max_bytes = INT_MAX;
+
+	/* capping dist_jitter to the range acceptable by tabledist() */
+	q->slot_config.dist_jitter = min_t(__s64, INT_MAX, abs(q->slot_config.dist_jitter));
+
 	q->slot.packets_left = q->slot_config.max_packets;
 	q->slot.bytes_left = q->slot_config.max_bytes;
 	if (q->slot_config.min_delay | q->slot_config.max_delay |
@@ -1037,6 +1041,9 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt,
 	if (tb[TCA_NETEM_SLOT])
 		get_slot(q, tb[TCA_NETEM_SLOT]);
 
+	/* capping jitter to the range acceptable by tabledist() */
+	q->jitter = min_t(s64, abs(q->jitter), INT_MAX);
+
 	return ret;
 
 get_table_failure:
-- 
cgit v1.2.3