From 435ccfa894e35e3d4a1799e6ac030e48a7b69ef5 Mon Sep 17 00:00:00 2001 From: Arjun Roy Date: Fri, 23 Oct 2020 11:47:09 -0700 Subject: tcp: Prevent low rmem stalls with SO_RCVLOWAT. With SO_RCVLOWAT, under memory pressure, it is possible to enter a state where: 1. We have not received enough bytes to satisfy SO_RCVLOWAT. 2. We have not entered buffer pressure (see tcp_rmem_pressure()). 3. But, we do not have enough buffer space to accept more packets. In this case, we advertise 0 rwnd (due to #3) but the application does not drain the receive queue (no wakeup because of #1 and #2) so the flow stalls. Modify the heuristic for SO_RCVLOWAT so that, if we are advertising rwnd<=rcv_mss, force a wakeup to prevent a stall. Without this patch, setting tcp_rmem to 6143 and disabling TCP autotune causes a stalled flow. With this patch, no stall occurs. This is with RPC-style traffic with large messages. Fixes: 03f45c883c6f ("tcp: avoid extra wakeups for SO_RCVLOWAT users") Signed-off-by: Arjun Roy Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20201023184709.217614-1-arjunroy.kdev@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 2 ++ net/ipv4/tcp_input.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bae4284bf542..b2bc3d7fe9e8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -485,6 +485,8 @@ static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, return true; if (tcp_rmem_pressure(sk)) return true; + if (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss) + return true; } if (sk->sk_prot->stream_memory_read) return sk->sk_prot->stream_memory_read(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fc445833b5e5..389d1b340248 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4908,7 +4908,8 @@ void tcp_data_ready(struct sock *sk) int avail = tp->rcv_nxt - tp->copied_seq; if (avail < sk->sk_rcvlowat && !tcp_rmem_pressure(sk) && - !sock_flag(sk, SOCK_DONE)) + !sock_flag(sk, SOCK_DONE) && + tcp_receive_window(tp) > inet_csk(sk)->icsk_ack.rcv_mss) return; sk->sk_data_ready(sk); -- cgit v1.2.3 From af545bb5ee53f5261db631db2ac4cde54038bdaf Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Fri, 23 Oct 2020 16:37:57 +0200 Subject: vsock: use ns_capable_noaudit() on socket create During __vsock_create() CAP_NET_ADMIN is used to determine if the vsock_sock->trusted should be set to true. This value is used later for determing if a remote connection should be allowed to connect to a restricted VM. Unfortunately, if the caller doesn't have CAP_NET_ADMIN, an audit message such as an selinux denial is generated even if the caller does not want a trusted socket. Logging errors on success is confusing. To avoid this, switch the capable(CAP_NET_ADMIN) check to the noaudit version. Reported-by: Roman Kiryanov https://android-review.googlesource.com/c/device/generic/goldfish/+/1468545/ Signed-off-by: Jeff Vander Stoep Reviewed-by: James Morris Link: https://lore.kernel.org/r/20201023143757.377574-1-jeffv@google.com Signed-off-by: Jakub Kicinski --- net/vmw_vsock/af_vsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 9e93bc201cc0..b4d7b8aba003 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -739,7 +739,7 @@ static struct sock *__vsock_create(struct net *net, vsk->buffer_min_size = psk->buffer_min_size; vsk->buffer_max_size = psk->buffer_max_size; } else { - vsk->trusted = capable(CAP_NET_ADMIN); + vsk->trusted = ns_capable_noaudit(&init_user_ns, CAP_NET_ADMIN); vsk->owner = get_current_cred(); vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT; vsk->buffer_size = VSOCK_DEFAULT_BUFFER_SIZE; -- cgit v1.2.3 From 4a9baf45fd72615a804947a8495b73c4a0a4cb54 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 23 Oct 2020 20:48:28 +0200 Subject: net/smc: fix null pointer dereference in smc_listen_decline() smc_listen_work() calls smc_listen_decline() on label out_decl, providing the ini pointer variable. But this pointer can still be null when the label out_decl is reached. Fix this by checking the ini variable in smc_listen_work() and call smc_listen_decline() with the result directly. Fixes: a7c9c5f4af7f ("net/smc: CLC accept / confirm V2") Signed-off-by: Karsten Graul Signed-off-by: Jakub Kicinski --- net/smc/af_smc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 82be0bd0f6e8..e9f487c8c6d5 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1317,10 +1317,10 @@ static void smc_listen_out_err(struct smc_sock *new_smc) /* listen worker: decline and fall back if possible */ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, - struct smc_init_info *ini, u8 version) + int local_first, u8 version) { /* RDMA setup failed, switch back to TCP */ - if (ini->first_contact_local) + if (local_first) smc_lgr_cleanup_early(&new_smc->conn); else smc_conn_free(&new_smc->conn); @@ -1768,7 +1768,8 @@ static void smc_listen_work(struct work_struct *work) out_unlock: mutex_unlock(&smc_server_lgr_pending); out_decl: - smc_listen_decline(new_smc, rc, ini, version); + smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0, + version); out_free: kfree(ini); kfree(buf); -- cgit v1.2.3 From 96d6fded958d971a3695009e0ed43aca6c598283 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 23 Oct 2020 20:48:29 +0200 Subject: net/smc: fix suppressed return code The patch that repaired the invalid return code in smcd_new_buf_create() missed to take care of errno ENOSPC which has a special meaning that no more DMBEs can be registered on the device. Fix that by keeping this errno value during the translation of the return code. Fixes: 6b1bbf94ab36 ("net/smc: fix invalid return code in smcd_new_buf_create()") Signed-off-by: Karsten Graul Signed-off-by: Jakub Kicinski --- net/smc/smc_core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index d790c43c473f..2b19863f7171 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1615,8 +1615,11 @@ static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, rc = smc_ism_register_dmb(lgr, bufsize, buf_desc); if (rc) { kfree(buf_desc); - return (rc == -ENOMEM) ? ERR_PTR(-EAGAIN) : - ERR_PTR(-EIO); + if (rc == -ENOMEM) + return ERR_PTR(-EAGAIN); + if (rc == -ENOSPC) + return ERR_PTR(-ENOSPC); + return ERR_PTR(-EIO); } buf_desc->pages = virt_to_page(buf_desc->cpu_addr); /* CDC header stored in buf. So, pretend it was smaller */ -- cgit v1.2.3 From 6c211809c87f0de939f3bd60ceec05338ae6eba1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 26 Oct 2020 11:00:59 +0300 Subject: devlink: Fix some error codes These paths don't set the error codes. It's especially important in devlink_nl_region_notify_build() where it leads to a NULL dereference in the caller. Fixes: 544e7c33ec2f ("net: devlink: Add support for port regions") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20201026080059.GA1628785@mwanda Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index a578634052a3..925da7c0fcf3 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4213,10 +4213,12 @@ static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink, if (err) goto nla_put_failure; - if (region->port) - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, - region->port->index)) + if (region->port) { + err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, + region->port->index); + if (err) goto nla_put_failure; + } err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->ops->name); if (err) @@ -4265,10 +4267,12 @@ devlink_nl_region_notify_build(struct devlink_region *region, if (err) goto out_cancel_msg; - if (region->port) - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, - region->port->index)) + if (region->port) { + err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, + region->port->index); + if (err) goto out_cancel_msg; + } err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->ops->name); @@ -4962,10 +4966,12 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, if (err) goto nla_put_failure; - if (region->port) - if (nla_put_u32(skb, DEVLINK_ATTR_PORT_INDEX, - region->port->index)) + if (region->port) { + err = nla_put_u32(skb, DEVLINK_ATTR_PORT_INDEX, + region->port->index); + if (err) goto nla_put_failure; + } err = nla_put_string(skb, DEVLINK_ATTR_REGION_NAME, region_name); if (err) -- cgit v1.2.3 From 0d8cb9464a7d9c3e6349db3f7719a80f3793347e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 26 Oct 2020 11:01:27 +0300 Subject: devlink: Unlock on error in dumpit() This needs to unlock before returning. Fixes: 544e7c33ec2f ("net: devlink: Add support for port regions") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20201026080127.GB1628785@mwanda Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 925da7c0fcf3..a932d95be798 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4919,8 +4919,10 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); port = devlink_port_get_by_index(devlink, index); - if (!port) - return -ENODEV; + if (!port) { + err = -ENODEV; + goto out_unlock; + } } region_name = nla_data(attrs[DEVLINK_ATTR_REGION_NAME]); -- cgit v1.2.3 From 501b72ae2472a15a80c0f4063ee8341870e1ef55 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 26 Oct 2020 11:29:45 +0100 Subject: net/sched: act_mpls: Add softdep on mpls_gso.ko TCA_MPLS_ACT_PUSH and TCA_MPLS_ACT_MAC_PUSH might be used on gso packets. Such packets will thus require mpls_gso.ko for segmentation. v2: Drop dependency on CONFIG_NET_MPLS_GSO in Kconfig (from Jakub and David). Fixes: 2a2ea50870ba ("net: sched: add mpls manipulation actions to TC") Signed-off-by: Guillaume Nault Link: https://lore.kernel.org/r/1f6cab15bbd15666795061c55563aaf6a386e90e.1603708007.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- net/sched/act_mpls.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index f40bf9771cb9..5c7456e5b5cf 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -426,6 +426,7 @@ static void __exit mpls_cleanup_module(void) module_init(mpls_init_module); module_exit(mpls_cleanup_module); +MODULE_SOFTDEP("post: mpls_gso"); MODULE_AUTHOR("Netronome Systems "); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MPLS manipulation actions"); -- cgit v1.2.3 From d6535dca28859d8d9ef80894eb287b2ac35a32e8 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 26 Oct 2020 14:33:27 +0200 Subject: net: protect tcf_block_unbind with block lock The tcf_block_unbind() expects that the caller will take block->cb_lock before calling it, however the code took RTNL lock and dropped cb_lock instead. This causes to the following kernel panic. WARNING: CPU: 1 PID: 13524 at net/sched/cls_api.c:1488 tcf_block_unbind+0x2db/0x420 Modules linked in: mlx5_ib mlx5_core mlxfw ptp pps_core act_mirred act_tunnel_key cls_flower vxlan ip6_udp_tunnel udp_tunnel dummy sch_ingress openvswitch nsh xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 br_netfilter rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi ib_umad ib_ipoib rdma_cm iw_cm ib_cm ib_uverbs ib_core overlay [last unloaded: mlxfw] CPU: 1 PID: 13524 Comm: test-ecmp-add-v Tainted: G W 5.9.0+ #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:tcf_block_unbind+0x2db/0x420 Code: ff 48 83 c4 40 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8d bc 24 30 01 00 00 be ff ff ff ff e8 7d 7f 70 00 85 c0 0f 85 7b fd ff ff <0f> 0b e9 74 fd ff ff 48 c7 c7 dc 6a 24 84 e8 02 ec fe fe e9 55 fd RSP: 0018:ffff888117d17968 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff88812f713c00 RCX: 1ffffffff0848d5b RDX: 0000000000000001 RSI: ffff88814fbc8130 RDI: ffff888107f2b878 RBP: 1ffff11022fa2f3f R08: 0000000000000000 R09: ffffffff84115a87 R10: fffffbfff0822b50 R11: ffff888107f2b898 R12: ffff88814fbc8000 R13: ffff88812f713c10 R14: ffff888117d17a38 R15: ffff88814fbc80c0 FS: 00007f6593d36740(0000) GS:ffff8882a4f00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005607a00758f8 CR3: 0000000131aea006 CR4: 0000000000170ea0 Call Trace: tc_block_indr_cleanup+0x3e0/0x5a0 ? tcf_block_unbind+0x420/0x420 ? __mutex_unlock_slowpath+0xe7/0x610 flow_indr_dev_unregister+0x5e2/0x930 ? mlx5e_restore_tunnel+0xdf0/0xdf0 [mlx5_core] ? mlx5e_restore_tunnel+0xdf0/0xdf0 [mlx5_core] ? flow_indr_block_cb_alloc+0x3c0/0x3c0 ? mlx5_db_free+0x37c/0x4b0 [mlx5_core] mlx5e_cleanup_rep_tx+0x8b/0xc0 [mlx5_core] mlx5e_detach_netdev+0xe5/0x120 [mlx5_core] mlx5e_vport_rep_unload+0x155/0x260 [mlx5_core] esw_offloads_disable+0x227/0x2b0 [mlx5_core] mlx5_eswitch_disable_locked.cold+0x38e/0x699 [mlx5_core] mlx5_eswitch_disable+0x94/0xf0 [mlx5_core] mlx5_device_disable_sriov+0x183/0x1f0 [mlx5_core] mlx5_core_sriov_configure+0xfd/0x230 [mlx5_core] sriov_numvfs_store+0x261/0x2f0 ? sriov_drivers_autoprobe_store+0x110/0x110 ? sysfs_file_ops+0x170/0x170 ? sysfs_file_ops+0x117/0x170 ? sysfs_file_ops+0x170/0x170 kernfs_fop_write+0x1ff/0x3f0 ? rcu_read_lock_any_held+0x6e/0x90 vfs_write+0x1f3/0x620 ksys_write+0xf9/0x1d0 ? __x64_sys_read+0xb0/0xb0 ? lockdep_hardirqs_on_prepare+0x273/0x3f0 ? syscall_enter_from_user_mode+0x1d/0x50 do_syscall_64+0x2d/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 <...> ---[ end trace bfdd028ada702879 ]--- Fixes: 0fdcf78d5973 ("net: use flow_indr_dev_setup_offload()") Signed-off-by: Leon Romanovsky Link: https://lore.kernel.org/r/20201026123327.1141066-1-leon@kernel.org Signed-off-by: Jakub Kicinski --- net/sched/cls_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index faeabff283a2..838b3fd94d77 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -652,12 +652,12 @@ static void tc_block_indr_cleanup(struct flow_block_cb *block_cb) block_cb->indr.binder_type, &block->flow_block, tcf_block_shared(block), &extack); + rtnl_lock(); down_write(&block->cb_lock); list_del(&block_cb->driver_list); list_move(&block_cb->list, &bo.cb_list); - up_write(&block->cb_lock); - rtnl_lock(); tcf_block_unbind(block, &bo); + up_write(&block->cb_lock); rtnl_unlock(); } -- cgit v1.2.3 From ceb1eb2fb609c88363e06618b8d4bbf7815a4e03 Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Tue, 27 Oct 2020 10:24:03 +0700 Subject: tipc: fix memory leak caused by tipc_buf_append() Commit ed42989eab57 ("tipc: fix the skb_unshare() in tipc_buf_append()") replaced skb_unshare() with skb_copy() to not reduce the data reference counter of the original skb intentionally. This is not the correct way to handle the cloned skb because it causes memory leak in 2 following cases: 1/ Sending multicast messages via broadcast link The original skb list is cloned to the local skb list for local destination. After that, the data reference counter of each skb in the original list has the value of 2. This causes each skb not to be freed after receiving ACK: tipc_link_advance_transmq() { ... /* release skb */ __skb_unlink(skb, &l->transmq); kfree_skb(skb); <-- memory exists after being freed } 2/ Sending multicast messages via replicast link Similar to the above case, each skb cannot be freed after purging the skb list: tipc_mcast_xmit() { ... __skb_queue_purge(pkts); <-- memory exists after being freed } This commit fixes this issue by using skb_unshare() instead. Besides, to avoid use-after-free error reported by KASAN, the pointer to the fragment is set to NULL before calling skb_unshare() to make sure that the original skb is not freed after freeing the fragment 2 times in case skb_unshare() returns NULL. Fixes: ed42989eab57 ("tipc: fix the skb_unshare() in tipc_buf_append()") Acked-by: Jon Maloy Reported-by: Thang Hoang Ngo Signed-off-by: Tung Nguyen Reviewed-by: Xin Long Acked-by: Cong Wang Link: https://lore.kernel.org/r/20201027032403.1823-1-tung.q.nguyen@dektech.com.au Signed-off-by: Jakub Kicinski --- net/tipc/msg.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 2a78aa701572..32c79c59052b 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -150,12 +150,11 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) if (fragid == FIRST_FRAGMENT) { if (unlikely(head)) goto err; - if (skb_cloned(frag)) - frag = skb_copy(frag, GFP_ATOMIC); + *buf = NULL; + frag = skb_unshare(frag, GFP_ATOMIC); if (unlikely(!frag)) goto err; head = *headbuf = frag; - *buf = NULL; TIPC_SKB_CB(head)->tail = NULL; if (skb_is_nonlinear(head)) { skb_walk_frags(head, tail) { -- cgit v1.2.3 From 9c3f94e1681bb0ebd93390f014082042d8bc067a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 27 Oct 2020 15:59:14 +0100 Subject: mptcp: add missing memory scheduling in the rx path When moving the skbs from the subflow into the msk receive queue, we must schedule there the required amount of memory. Try to borrow the required memory from the subflow, if needed, so that we leverage the existing TCP heuristic. Fixes: 6771bfd9ee24 ("mptcp: update mptcp ack sequence from work queue") Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Link: https://lore.kernel.org/r/f6143a6193a083574f11b00dbf7b5ad151bc4ff4.1603810630.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 185dacb39781..e7419fd15d84 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -274,6 +274,15 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, skb_ext_reset(skb); skb_orphan(skb); + /* try to fetch required memory from subflow */ + if (!sk_rmem_schedule(sk, skb, skb->truesize)) { + if (ssk->sk_forward_alloc < skb->truesize) + goto drop; + __sk_mem_reclaim(ssk, skb->truesize); + if (!sk_rmem_schedule(sk, skb, skb->truesize)) + goto drop; + } + /* the skb map_seq accounts for the skb offset: * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq * value @@ -301,6 +310,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, * will retransmit as needed, if needed. */ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); +drop: mptcp_drop(sk, skb); return false; } -- cgit v1.2.3 From eadd1befdd778a1eca57fad058782bd22b4db804 Mon Sep 17 00:00:00 2001 From: Aleksandr Nogikh Date: Wed, 28 Oct 2020 17:07:31 +0000 Subject: netem: fix zero division in tabledist Currently it is possible to craft a special netlink RTM_NEWQDISC command that can result in jitter being equal to 0x80000000. It is enough to set the 32 bit jitter to 0x02000000 (it will later be multiplied by 2^6) or just set the 64 bit jitter via TCA_NETEM_JITTER64. This causes an overflow during the generation of uniformly distributed numbers in tabledist(), which in turn leads to division by zero (sigma != 0, but sigma * 2 is 0). The related fragment of code needs 32-bit division - see commit 9b0ed89 ("netem: remove unnecessary 64 bit modulus"), so switching to 64 bit is not an option. Fix the issue by keeping the value of jitter within the range that can be adequately handled by tabledist() - [0;INT_MAX]. As negative std deviation makes no sense, take the absolute value of the passed value and cap it at INT_MAX. Inside tabledist(), switch to unsigned 32 bit arithmetic in order to prevent overflows. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Aleksandr Nogikh Reported-by: syzbot+ec762a6342ad0d3c0d8f@syzkaller.appspotmail.com Acked-by: Stephen Hemminger Link: https://lore.kernel.org/r/20201028170731.1383332-1-aleksandrnogikh@gmail.com Signed-off-by: Jakub Kicinski --- net/sched/sch_netem.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 84f82771cdf5..0c345e43a09a 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -330,7 +330,7 @@ static s64 tabledist(s64 mu, s32 sigma, /* default uniform distribution */ if (dist == NULL) - return ((rnd % (2 * sigma)) + mu) - sigma; + return ((rnd % (2 * (u32)sigma)) + mu) - sigma; t = dist->table[rnd % dist->size]; x = (sigma % NETEM_DIST_SCALE) * t; @@ -812,6 +812,10 @@ static void get_slot(struct netem_sched_data *q, const struct nlattr *attr) q->slot_config.max_packets = INT_MAX; if (q->slot_config.max_bytes == 0) q->slot_config.max_bytes = INT_MAX; + + /* capping dist_jitter to the range acceptable by tabledist() */ + q->slot_config.dist_jitter = min_t(__s64, INT_MAX, abs(q->slot_config.dist_jitter)); + q->slot.packets_left = q->slot_config.max_packets; q->slot.bytes_left = q->slot_config.max_bytes; if (q->slot_config.min_delay | q->slot_config.max_delay | @@ -1037,6 +1041,9 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_NETEM_SLOT]) get_slot(q, tb[TCA_NETEM_SLOT]); + /* capping jitter to the range acceptable by tabledist() */ + q->jitter = min_t(s64, abs(q->jitter), INT_MAX); + return ret; get_table_failure: -- cgit v1.2.3