From a43608fa77213ad5ac5f75994254b9f65d57cfa0 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Wed, 24 Oct 2018 10:27:12 +0200 Subject: can: raw: check for CAN FD capable netdev in raw_sendmsg() When the socket is CAN FD enabled it can handle CAN FD frame transmissions. Add an additional check in raw_sendmsg() as a CAN2.0 CAN driver (non CAN FD) should never see a CAN FD frame. Due to the commonly used can_dropped_invalid_skb() function the CAN 2.0 driver would drop that CAN FD frame anyway - but with this patch the user gets a proper -EINVAL return code. Signed-off-by: Oliver Hartkopp Cc: linux-stable Signed-off-by: Marc Kleine-Budde --- net/can/raw.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/can/raw.c b/net/can/raw.c index 1051eee82581..3aab7664933f 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -745,18 +745,19 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) } else ifindex = ro->ifindex; - if (ro->fd_frames) { + dev = dev_get_by_index(sock_net(sk), ifindex); + if (!dev) + return -ENXIO; + + err = -EINVAL; + if (ro->fd_frames && dev->mtu == CANFD_MTU) { if (unlikely(size != CANFD_MTU && size != CAN_MTU)) - return -EINVAL; + goto put_dev; } else { if (unlikely(size != CAN_MTU)) - return -EINVAL; + goto put_dev; } - dev = dev_get_by_index(sock_net(sk), ifindex); - if (!dev) - return -ENXIO; - skb = sock_alloc_send_skb(sk, size + sizeof(struct can_skb_priv), msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) -- cgit v1.2.3 From f4156f9656feac21f4de712fac94fae964c5d402 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Tue, 30 Oct 2018 12:17:10 +0100 Subject: batman-adv: Use explicit tvlv padding for ELP packets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The announcement messages of batman-adv COMPAT_VERSION 15 have the possibility to announce additional information via a dynamic TVLV part. This part is optional for the ELP packets and currently not parsed by the Linux implementation. Still out-of-tree versions are using it to transport things like neighbor hashes to optimize the rebroadcast behavior. Since the ELP broadcast packets are smaller than the minimal ethernet packet, it often has to be padded. This is often done (as specified in RFC894) with octets of zero and thus work perfectly fine with the TVLV part (making it a zero length and thus empty). But not all ethernet compatible hardware seems to follow this advice. To avoid ambiguous situations when parsing the TVLV header, just force the 4 bytes (TVLV length + padding) after the required ELP header to zero. Fixes: d6f94d91f766 ("batman-adv: ELP - adding basic infrastructure") Reported-by: Linus Lüssing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v_elp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 9f481cfdf77d..e8090f099eb8 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -352,19 +352,21 @@ out: */ int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface) { + static const size_t tvlv_padding = sizeof(__be32); struct batadv_elp_packet *elp_packet; unsigned char *elp_buff; u32 random_seqno; size_t size; int res = -ENOMEM; - size = ETH_HLEN + NET_IP_ALIGN + BATADV_ELP_HLEN; + size = ETH_HLEN + NET_IP_ALIGN + BATADV_ELP_HLEN + tvlv_padding; hard_iface->bat_v.elp_skb = dev_alloc_skb(size); if (!hard_iface->bat_v.elp_skb) goto out; skb_reserve(hard_iface->bat_v.elp_skb, ETH_HLEN + NET_IP_ALIGN); - elp_buff = skb_put_zero(hard_iface->bat_v.elp_skb, BATADV_ELP_HLEN); + elp_buff = skb_put_zero(hard_iface->bat_v.elp_skb, + BATADV_ELP_HLEN + tvlv_padding); elp_packet = (struct batadv_elp_packet *)elp_buff; elp_packet->packet_type = BATADV_ELP; -- cgit v1.2.3 From d7d8bbb40a5b1f682ee6589e212934f4c6b8ad60 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 7 Nov 2018 23:09:12 +0100 Subject: batman-adv: Expand merged fragment buffer for full packet MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The complete size ("total_size") of the fragmented packet is stored in the fragment header and in the size of the fragment chain. When the fragments are ready for merge, the skbuff's tail of the first fragment is expanded to have enough room after the data pointer for at least total_size. This means that it gets expanded by total_size - first_skb->len. But this is ignoring the fact that after expanding the buffer, the fragment header is pulled by from this buffer. Assuming that the tailroom of the buffer was already 0, the buffer after the data pointer of the skbuff is now only total_size - len(fragment_header) large. When the merge function is then processing the remaining fragments, the code to copy the data over to the merged skbuff will cause an skb_over_panic when it tries to actually put enough data to fill the total_size bytes of the packet. The size of the skb_pull must therefore also be taken into account when the buffer's tailroom is expanded. Fixes: 610bfc6bc99b ("batman-adv: Receive fragmented packets and merge") Reported-by: Martin Weinelt Co-authored-by: Linus Lüssing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/fragmentation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 0fddc17106bd..5b71a289d04f 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -275,7 +275,7 @@ batadv_frag_merge_packets(struct hlist_head *chain) kfree(entry); packet = (struct batadv_frag_packet *)skb_out->data; - size = ntohs(packet->total_size); + size = ntohs(packet->total_size) + hdr_size; /* Make room for the rest of the fragments. */ if (pskb_expand_head(skb_out, 0, size - skb_out->len, GFP_ATOMIC) < 0) { -- cgit v1.2.3 From f8504f4ca0a0e9f84546ef86e00b24d2ea9a0bd2 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 13 Nov 2018 01:08:25 +0800 Subject: l2tp: fix a sock refcnt leak in l2tp_tunnel_register This issue happens when trying to add an existent tunnel. It doesn't call sock_put() before returning -EEXIST to release the sock refcnt that was held by calling sock_hold() before the existence check. This patch is to fix it by holding the sock after doing the existence check. Fixes: f6cd651b056f ("l2tp: fix race in duplicate tunnel detection") Reported-by: Jianlin Shi Signed-off-by: Xin Long Reviewed-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 82cdf9020b53..26f1d435696a 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1490,12 +1490,7 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, goto err_sock; } - sk = sock->sk; - - sock_hold(sk); - tunnel->sock = sk; tunnel->l2tp_net = net; - pn = l2tp_pernet(net); spin_lock_bh(&pn->l2tp_tunnel_list_lock); @@ -1510,6 +1505,10 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, list_add_rcu(&tunnel->list, &pn->l2tp_tunnel_list); spin_unlock_bh(&pn->l2tp_tunnel_list_lock); + sk = sock->sk; + sock_hold(sk); + tunnel->sock = sk; + if (tunnel->encap == L2TP_ENCAPTYPE_UDP) { struct udp_tunnel_sock_cfg udp_cfg = { .sk_user_data = tunnel, -- cgit v1.2.3 From 7150ceaacb27f7b3bf494e72cd4be4e11612dfff Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 12 Nov 2018 22:33:22 +0000 Subject: rxrpc: Fix life check The life-checking function, which is used by kAFS to make sure that a call is still live in the event of a pending signal, only samples the received packet serial number counter; it doesn't actually provoke a change in the counter, rather relying on the server to happen to give us a packet in the time window. Fix this by adding a function to force a ping to be transmitted. kAFS then keeps track of whether there's been a stall, and if so, uses the new function to ping the server, resetting the timeout to allow the reply to come back. If there's a stall, a ping and the call is *still* stalled in the same place after another period, then the call will be aborted. Fixes: bc5e3a546d55 ("rxrpc: Use MSG_WAITALL to tell sendmsg() to temporarily ignore signals") Fixes: f4d15fb6f99a ("rxrpc: Provide functions for allowing cleaner handling of signals") Signed-off-by: David Howells Signed-off-by: David S. Miller --- Documentation/networking/rxrpc.txt | 17 +++++++++++------ fs/afs/rxrpc.c | 11 ++++++++++- include/net/af_rxrpc.h | 3 ++- include/trace/events/rxrpc.h | 2 ++ net/rxrpc/af_rxrpc.c | 27 +++++++++++++++++++++++---- 5 files changed, 48 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt index 605e00cdd6be..89f1302d593a 100644 --- a/Documentation/networking/rxrpc.txt +++ b/Documentation/networking/rxrpc.txt @@ -1056,18 +1056,23 @@ The kernel interface functions are as follows: u32 rxrpc_kernel_check_life(struct socket *sock, struct rxrpc_call *call); + void rxrpc_kernel_probe_life(struct socket *sock, + struct rxrpc_call *call); - This returns a number that is updated when ACKs are received from the peer - (notably including PING RESPONSE ACKs which we can elicit by sending PING - ACKs to see if the call still exists on the server). The caller should - compare the numbers of two calls to see if the call is still alive after - waiting for a suitable interval. + The first function returns a number that is updated when ACKs are received + from the peer (notably including PING RESPONSE ACKs which we can elicit by + sending PING ACKs to see if the call still exists on the server). The + caller should compare the numbers of two calls to see if the call is still + alive after waiting for a suitable interval. This allows the caller to work out if the server is still contactable and if the call is still alive on the server whilst waiting for the server to process a client operation. - This function may transmit a PING ACK. + The second function causes a ping ACK to be transmitted to try to provoke + the peer into responding, which would then cause the value returned by the + first function to change. Note that this must be called in TASK_RUNNING + state. (*) Get reply timestamp. diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 59970886690f..a7b44863d502 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -576,6 +576,7 @@ static long afs_wait_for_call_to_complete(struct afs_call *call, { signed long rtt2, timeout; long ret; + bool stalled = false; u64 rtt; u32 life, last_life; @@ -609,12 +610,20 @@ static long afs_wait_for_call_to_complete(struct afs_call *call, life = rxrpc_kernel_check_life(call->net->socket, call->rxcall); if (timeout == 0 && - life == last_life && signal_pending(current)) + life == last_life && signal_pending(current)) { + if (stalled) break; + __set_current_state(TASK_RUNNING); + rxrpc_kernel_probe_life(call->net->socket, call->rxcall); + timeout = rtt2; + stalled = true; + continue; + } if (life != last_life) { timeout = rtt2; last_life = life; + stalled = false; } timeout = schedule_timeout(timeout); diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index de587948042a..1adefe42c0a6 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -77,7 +77,8 @@ int rxrpc_kernel_retry_call(struct socket *, struct rxrpc_call *, struct sockaddr_rxrpc *, struct key *); int rxrpc_kernel_check_call(struct socket *, struct rxrpc_call *, enum rxrpc_call_completion *, u32 *); -u32 rxrpc_kernel_check_life(struct socket *, struct rxrpc_call *); +u32 rxrpc_kernel_check_life(const struct socket *, const struct rxrpc_call *); +void rxrpc_kernel_probe_life(struct socket *, struct rxrpc_call *); u32 rxrpc_kernel_get_epoch(struct socket *, struct rxrpc_call *); bool rxrpc_kernel_get_reply_time(struct socket *, struct rxrpc_call *, ktime_t *); diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 573d5b901fb1..5b50fe4906d2 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -181,6 +181,7 @@ enum rxrpc_timer_trace { enum rxrpc_propose_ack_trace { rxrpc_propose_ack_client_tx_end, rxrpc_propose_ack_input_data, + rxrpc_propose_ack_ping_for_check_life, rxrpc_propose_ack_ping_for_keepalive, rxrpc_propose_ack_ping_for_lost_ack, rxrpc_propose_ack_ping_for_lost_reply, @@ -380,6 +381,7 @@ enum rxrpc_tx_point { #define rxrpc_propose_ack_traces \ EM(rxrpc_propose_ack_client_tx_end, "ClTxEnd") \ EM(rxrpc_propose_ack_input_data, "DataIn ") \ + EM(rxrpc_propose_ack_ping_for_check_life, "ChkLife") \ EM(rxrpc_propose_ack_ping_for_keepalive, "KeepAlv") \ EM(rxrpc_propose_ack_ping_for_lost_ack, "LostAck") \ EM(rxrpc_propose_ack_ping_for_lost_reply, "LostRpl") \ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 64362d078da8..a2522f9d71e2 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -375,16 +375,35 @@ EXPORT_SYMBOL(rxrpc_kernel_end_call); * getting ACKs from the server. Returns a number representing the life state * which can be compared to that returned by a previous call. * - * If this is a client call, ping ACKs will be sent to the server to find out - * whether it's still responsive and whether the call is still alive on the - * server. + * If the life state stalls, rxrpc_kernel_probe_life() should be called and + * then 2RTT waited. */ -u32 rxrpc_kernel_check_life(struct socket *sock, struct rxrpc_call *call) +u32 rxrpc_kernel_check_life(const struct socket *sock, + const struct rxrpc_call *call) { return call->acks_latest; } EXPORT_SYMBOL(rxrpc_kernel_check_life); +/** + * rxrpc_kernel_probe_life - Poke the peer to see if it's still alive + * @sock: The socket the call is on + * @call: The call to check + * + * In conjunction with rxrpc_kernel_check_life(), allow a kernel service to + * find out whether a call is still alive by pinging it. This should cause the + * life state to be bumped in about 2*RTT. + * + * The must be called in TASK_RUNNING state on pain of might_sleep() objecting. + */ +void rxrpc_kernel_probe_life(struct socket *sock, struct rxrpc_call *call) +{ + rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, false, + rxrpc_propose_ack_ping_for_check_life); + rxrpc_send_ack_packet(call, true, NULL); +} +EXPORT_SYMBOL(rxrpc_kernel_probe_life); + /** * rxrpc_kernel_get_epoch - Retrieve the epoch value from a call. * @sock: The socket the call is on -- cgit v1.2.3 From 08e14fe429a07475ee9f29a283945d602e4a6d92 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 12 Nov 2018 16:17:16 -0800 Subject: net_sched: sch_fq: ensure maxrate fq parameter applies to EDT flows When EDT conversion happened, fq lost the ability to enfore a maxrate for all flows. It kept it for non EDT flows. This commit restores the functionality. Tested: tc qd replace dev eth0 root fq maxrate 500Mbit netperf -P0 -H host -- -O THROUGHPUT 489.75 Fixes: ab408b6dc744 ("tcp: switch tcp and sch_fq to new earliest departure time model") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_fq.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 4b1af706896c..25a7cf6d380f 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -469,22 +469,29 @@ begin: goto begin; } prefetch(&skb->end); - f->credit -= qdisc_pkt_len(skb); + plen = qdisc_pkt_len(skb); + f->credit -= plen; - if (ktime_to_ns(skb->tstamp) || !q->rate_enable) + if (!q->rate_enable) goto out; rate = q->flow_max_rate; - if (skb->sk) - rate = min(skb->sk->sk_pacing_rate, rate); - - if (rate <= q->low_rate_threshold) { - f->credit = 0; - plen = qdisc_pkt_len(skb); - } else { - plen = max(qdisc_pkt_len(skb), q->quantum); - if (f->credit > 0) - goto out; + + /* If EDT time was provided for this skb, we need to + * update f->time_next_packet only if this qdisc enforces + * a flow max rate. + */ + if (!skb->tstamp) { + if (skb->sk) + rate = min(skb->sk->sk_pacing_rate, rate); + + if (rate <= q->low_rate_threshold) { + f->credit = 0; + } else { + plen = max(plen, q->quantum); + if (f->credit > 0) + goto out; + } } if (rate != ~0UL) { u64 len = (u64)plen * NSEC_PER_SEC; -- cgit v1.2.3 From 761f60261b4401aa368d71d431b4c218af0efcee Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 14 Nov 2018 00:48:28 +0800 Subject: ipv6: fix a dst leak when removing its exception These is no need to hold dst before calling rt6_remove_exception_rt(). The call to dst_hold_safe() in ip6_link_failure() was for ip6_del_rt(), which has been removed in Commit 93531c674315 ("net/ipv6: separate handling of FIB entries from dst based routes"). Otherwise, it will cause a dst leak. This patch is to simply remove the dst_hold_safe() call before calling rt6_remove_exception_rt() and also do the same in ip6_del_cached_rt(). It's safe, because the removal of the exception that holds its dst's refcnt is protected by rt6_exception_lock. Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst based routes") Fixes: 23fb93a4d3f1 ("net/ipv6: Cleanup exception and cache route handling") Reported-by: Li Shuang Signed-off-by: Xin Long Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 2a7423c39456..14b422f35504 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2232,8 +2232,7 @@ static void ip6_link_failure(struct sk_buff *skb) if (rt) { rcu_read_lock(); if (rt->rt6i_flags & RTF_CACHE) { - if (dst_hold_safe(&rt->dst)) - rt6_remove_exception_rt(rt); + rt6_remove_exception_rt(rt); } else { struct fib6_info *from; struct fib6_node *fn; @@ -3214,8 +3213,8 @@ static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) if (cfg->fc_flags & RTF_GATEWAY && !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) goto out; - if (dst_hold_safe(&rt->dst)) - rc = rt6_remove_exception_rt(rt); + + rc = rt6_remove_exception_rt(rt); out: return rc; } -- cgit v1.2.3 From 19ab69107d3ecfb7cd3e38ad262a881be40c01a3 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 14 Nov 2018 12:17:25 +0100 Subject: net/sched: act_pedit: fix memory leak when IDR allocation fails tcf_idr_check_alloc() can return a negative value, on allocation failures (-ENOMEM) or IDR exhaustion (-ENOSPC): don't leak keys_ex in these cases. Fixes: 0190c1d452a9 ("net: sched: atomically check-allocate action") Signed-off-by: Davide Caratti Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/act_pedit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index da3dd0f68cc2..2b372a06b432 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -201,7 +201,8 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, goto out_release; } } else { - return err; + ret = err; + goto out_free; } p = to_pedit(*a); -- cgit v1.2.3 From 95506588d2c1d72ca29adef8ae9bf771bcfb4ced Mon Sep 17 00:00:00 2001 From: Slavomir Kaslev Date: Fri, 16 Nov 2018 11:27:53 +0200 Subject: socket: do a generic_file_splice_read when proto_ops has no splice_read splice(2) fails with -EINVAL when called reading on a socket with no splice_read set in its proto_ops (such as vsock sockets). Switch this to fallbacks to a generic_file_splice_read instead. Signed-off-by: Slavomir Kaslev Signed-off-by: David S. Miller --- net/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index 593826e11a53..334fcc617ef2 100644 --- a/net/socket.c +++ b/net/socket.c @@ -853,7 +853,7 @@ static ssize_t sock_splice_read(struct file *file, loff_t *ppos, struct socket *sock = file->private_data; if (unlikely(!sock->ops->splice_read)) - return -EINVAL; + return generic_file_splice_read(file, ppos, pipe, len, flags); return sock->ops->splice_read(sock, ppos, pipe, len, flags); } -- cgit v1.2.3 From 9d332e69c1dc74dcd748de7cbd2dac5c61bda265 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 16 Nov 2018 18:50:01 +0200 Subject: net: bridge: fix vlan stats use-after-free on destruction Syzbot reported a use-after-free of the global vlan context on port vlan destruction. When I added per-port vlan stats I missed the fact that the global vlan context can be freed before the per-port vlan rcu callback. There're a few different ways to deal with this, I've chosen to add a new private flag that is set only when per-port stats are allocated so we can directly check it on destruction without dereferencing the global context at all. The new field in net_bridge_vlan uses a hole. v2: cosmetic change, move the check to br_process_vlan_info where the other checks are done v3: add change log in the patch, add private (in-kernel only) flags in a hole in net_bridge_vlan struct and use that instead of mixing user-space flags with private flags Fixes: 9163a0fc1f0c ("net: bridge: add support for per-port vlan stats") Reported-by: syzbot+04681da557a0e49a52e5@syzkaller.appspotmail.com Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_private.h | 7 +++++++ net/bridge/br_vlan.c | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 2920e06a5403..04c19a37e500 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -102,12 +102,18 @@ struct br_tunnel_info { struct metadata_dst *tunnel_dst; }; +/* private vlan flags */ +enum { + BR_VLFLAG_PER_PORT_STATS = BIT(0), +}; + /** * struct net_bridge_vlan - per-vlan entry * * @vnode: rhashtable member * @vid: VLAN id * @flags: bridge vlan flags + * @priv_flags: private (in-kernel) bridge vlan flags * @stats: per-cpu VLAN statistics * @br: if MASTER flag set, this points to a bridge struct * @port: if MASTER flag unset, this points to a port struct @@ -127,6 +133,7 @@ struct net_bridge_vlan { struct rhash_head tnode; u16 vid; u16 flags; + u16 priv_flags; struct br_vlan_stats __percpu *stats; union { struct net_bridge *br; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 8c9297a01947..e84be08b8285 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -197,7 +197,7 @@ static void nbp_vlan_rcu_free(struct rcu_head *rcu) v = container_of(rcu, struct net_bridge_vlan, rcu); WARN_ON(br_vlan_is_master(v)); /* if we had per-port stats configured then free them here */ - if (v->brvlan->stats != v->stats) + if (v->priv_flags & BR_VLFLAG_PER_PORT_STATS) free_percpu(v->stats); v->stats = NULL; kfree(v); @@ -264,6 +264,7 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags) err = -ENOMEM; goto out_filt; } + v->priv_flags |= BR_VLFLAG_PER_PORT_STATS; } else { v->stats = masterv->stats; } -- cgit v1.2.3 From 16f7eb2b77b55da816c4e207f3f9440a8cafc00a Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 16 Nov 2018 16:58:19 +0100 Subject: ip_tunnel: don't force DF when MTU is locked The various types of tunnels running over IPv4 can ask to set the DF bit to do PMTU discovery. However, PMTU discovery is subject to the threshold set by the net.ipv4.route.min_pmtu sysctl, and is also disabled on routes with "mtu lock". In those cases, we shouldn't set the DF bit. This patch makes setting the DF bit conditional on the route's MTU locking state. This issue seems to be older than git history. Signed-off-by: Sabrina Dubroca Reviewed-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index dde671e97829..c248e0dccbe1 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -80,7 +80,7 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, iph->version = 4; iph->ihl = sizeof(struct iphdr) >> 2; - iph->frag_off = df; + iph->frag_off = ip_mtu_locked(&rt->dst) ? 0 : df; iph->protocol = proto; iph->tos = tos; iph->daddr = dst; -- cgit v1.2.3 From 33d9a2c72f086cbf1087b2fd2d1a15aa9df14a7f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 17 Nov 2018 21:57:02 -0800 Subject: net-gro: reset skb->pkt_type in napi_reuse_skb() eth_type_trans() assumes initial value for skb->pkt_type is PACKET_HOST. This is indeed the value right after a fresh skb allocation. However, it is possible that GRO merged a packet with a different value (like PACKET_OTHERHOST in case macvlan is used), so we need to make sure napi->skb will have pkt_type set back to PACKET_HOST. Otherwise, valid packets might be dropped by the stack because their pkt_type is not PACKET_HOST. napi_reuse_skb() was added in commit 96e93eab2033 ("gro: Add internal interfaces for VLAN"), but this bug always has been there. Fixes: 96e93eab2033 ("gro: Add internal interfaces for VLAN") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 0ffcbdd55fa9..066aa902d85c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5655,6 +5655,10 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->vlan_tci = 0; skb->dev = napi->dev; skb->skb_iif = 0; + + /* eth_type_trans() assumes pkt_type is PACKET_HOST */ + skb->pkt_type = PACKET_HOST; + skb->encapsulation = 0; skb_shinfo(skb)->gso_type = 0; skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); -- cgit v1.2.3 From adba75be0d23cca92a028749d92c60c8909bbdb3 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 16 Nov 2018 16:55:04 -0500 Subject: tipc: fix lockdep warning when reinitilaizing sockets We get the following warning: [ 47.926140] 32-bit node address hash set to 2010a0a [ 47.927202] [ 47.927433] ================================ [ 47.928050] WARNING: inconsistent lock state [ 47.928661] 4.19.0+ #37 Tainted: G E [ 47.929346] -------------------------------- [ 47.929954] inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage. [ 47.930116] swapper/3/0 [HC0[0]:SC1[3]:HE1:SE0] takes: [ 47.930116] 00000000af8bc31e (&(&ht->lock)->rlock){+.?.}, at: rhashtable_walk_enter+0x36/0xb0 [ 47.930116] {SOFTIRQ-ON-W} state was registered at: [ 47.930116] _raw_spin_lock+0x29/0x60 [ 47.930116] rht_deferred_worker+0x556/0x810 [ 47.930116] process_one_work+0x1f5/0x540 [ 47.930116] worker_thread+0x64/0x3e0 [ 47.930116] kthread+0x112/0x150 [ 47.930116] ret_from_fork+0x3a/0x50 [ 47.930116] irq event stamp: 14044 [ 47.930116] hardirqs last enabled at (14044): [] __local_bh_enable_ip+0x7a/0xf0 [ 47.938117] hardirqs last disabled at (14043): [] __local_bh_enable_ip+0x41/0xf0 [ 47.938117] softirqs last enabled at (14028): [] irq_enter+0x5e/0x60 [ 47.938117] softirqs last disabled at (14029): [] irq_exit+0xb5/0xc0 [ 47.938117] [ 47.938117] other info that might help us debug this: [ 47.938117] Possible unsafe locking scenario: [ 47.938117] [ 47.938117] CPU0 [ 47.938117] ---- [ 47.938117] lock(&(&ht->lock)->rlock); [ 47.938117] [ 47.938117] lock(&(&ht->lock)->rlock); [ 47.938117] [ 47.938117] *** DEADLOCK *** [ 47.938117] [ 47.938117] 2 locks held by swapper/3/0: [ 47.938117] #0: 0000000062c64f90 ((&d->timer)){+.-.}, at: call_timer_fn+0x5/0x280 [ 47.938117] #1: 00000000ee39619c (&(&d->lock)->rlock){+.-.}, at: tipc_disc_timeout+0xc8/0x540 [tipc] [ 47.938117] [ 47.938117] stack backtrace: [ 47.938117] CPU: 3 PID: 0 Comm: swapper/3 Tainted: G E 4.19.0+ #37 [ 47.938117] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 47.938117] Call Trace: [ 47.938117] [ 47.938117] dump_stack+0x5e/0x8b [ 47.938117] print_usage_bug+0x1ed/0x1ff [ 47.938117] mark_lock+0x5b5/0x630 [ 47.938117] __lock_acquire+0x4c0/0x18f0 [ 47.938117] ? lock_acquire+0xa6/0x180 [ 47.938117] lock_acquire+0xa6/0x180 [ 47.938117] ? rhashtable_walk_enter+0x36/0xb0 [ 47.938117] _raw_spin_lock+0x29/0x60 [ 47.938117] ? rhashtable_walk_enter+0x36/0xb0 [ 47.938117] rhashtable_walk_enter+0x36/0xb0 [ 47.938117] tipc_sk_reinit+0xb0/0x410 [tipc] [ 47.938117] ? mark_held_locks+0x6f/0x90 [ 47.938117] ? __local_bh_enable_ip+0x7a/0xf0 [ 47.938117] ? lockdep_hardirqs_on+0x20/0x1a0 [ 47.938117] tipc_net_finalize+0xbf/0x180 [tipc] [ 47.938117] tipc_disc_timeout+0x509/0x540 [tipc] [ 47.938117] ? call_timer_fn+0x5/0x280 [ 47.938117] ? tipc_disc_msg_xmit.isra.19+0xa0/0xa0 [tipc] [ 47.938117] ? tipc_disc_msg_xmit.isra.19+0xa0/0xa0 [tipc] [ 47.938117] call_timer_fn+0xa1/0x280 [ 47.938117] ? tipc_disc_msg_xmit.isra.19+0xa0/0xa0 [tipc] [ 47.938117] run_timer_softirq+0x1f2/0x4d0 [ 47.938117] __do_softirq+0xfc/0x413 [ 47.938117] irq_exit+0xb5/0xc0 [ 47.938117] smp_apic_timer_interrupt+0xac/0x210 [ 47.938117] apic_timer_interrupt+0xf/0x20 [ 47.938117] [ 47.938117] RIP: 0010:default_idle+0x1c/0x140 [ 47.938117] Code: 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 41 54 55 53 65 8b 2d d8 2b 74 65 0f 1f 44 00 00 e8 c6 2c 8b ff fb f4 <65> 8b 2d c5 2b 74 65 0f 1f 44 00 00 5b 5d 41 5c c3 65 8b 05 b4 2b [ 47.938117] RSP: 0018:ffffaf6ac0207ec8 EFLAGS: 00000206 ORIG_RAX: ffffffffffffff13 [ 47.938117] RAX: ffff8f5b3735e200 RBX: 0000000000000003 RCX: 0000000000000001 [ 47.938117] RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffff8f5b3735e200 [ 47.938117] RBP: 0000000000000003 R08: 0000000000000001 R09: 0000000000000000 [ 47.938117] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [ 47.938117] R13: 0000000000000000 R14: ffff8f5b3735e200 R15: ffff8f5b3735e200 [ 47.938117] ? default_idle+0x1a/0x140 [ 47.938117] do_idle+0x1bc/0x280 [ 47.938117] cpu_startup_entry+0x19/0x20 [ 47.938117] start_secondary+0x187/0x1c0 [ 47.938117] secondary_startup_64+0xa4/0xb0 The reason seems to be that tipc_net_finalize()->tipc_sk_reinit() is calling the function rhashtable_walk_enter() within a timer interrupt. We fix this by executing tipc_net_finalize() in work queue context. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/discover.c | 19 ++++++++++--------- net/tipc/net.c | 45 +++++++++++++++++++++++++++++++++++++-------- net/tipc/net.h | 2 +- 3 files changed, 48 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/tipc/discover.c b/net/tipc/discover.c index 2830709957bd..c138d68e8a69 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -166,7 +166,8 @@ static bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d, /* Apply trial address if we just left trial period */ if (!trial && !self) { - tipc_net_finalize(net, tn->trial_addr); + tipc_sched_net_finalize(net, tn->trial_addr); + msg_set_prevnode(buf_msg(d->skb), tn->trial_addr); msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); } @@ -300,14 +301,12 @@ static void tipc_disc_timeout(struct timer_list *t) goto exit; } - /* Trial period over ? */ - if (!time_before(jiffies, tn->addr_trial_end)) { - /* Did we just leave it ? */ - if (!tipc_own_addr(net)) - tipc_net_finalize(net, tn->trial_addr); - - msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); - msg_set_prevnode(buf_msg(d->skb), tipc_own_addr(net)); + /* Did we just leave trial period ? */ + if (!time_before(jiffies, tn->addr_trial_end) && !tipc_own_addr(net)) { + mod_timer(&d->timer, jiffies + TIPC_DISC_INIT); + spin_unlock_bh(&d->lock); + tipc_sched_net_finalize(net, tn->trial_addr); + return; } /* Adjust timeout interval according to discovery phase */ @@ -319,6 +318,8 @@ static void tipc_disc_timeout(struct timer_list *t) d->timer_intv = TIPC_DISC_SLOW; else if (!d->num_nodes && d->timer_intv > TIPC_DISC_FAST) d->timer_intv = TIPC_DISC_FAST; + msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); + msg_set_prevnode(buf_msg(d->skb), tn->trial_addr); } mod_timer(&d->timer, jiffies + d->timer_intv); diff --git a/net/tipc/net.c b/net/tipc/net.c index 62199cf5a56c..f076edb74338 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -104,6 +104,14 @@ * - A local spin_lock protecting the queue of subscriber events. */ +struct tipc_net_work { + struct work_struct work; + struct net *net; + u32 addr; +}; + +static void tipc_net_finalize(struct net *net, u32 addr); + int tipc_net_init(struct net *net, u8 *node_id, u32 addr) { if (tipc_own_id(net)) { @@ -119,17 +127,38 @@ int tipc_net_init(struct net *net, u8 *node_id, u32 addr) return 0; } -void tipc_net_finalize(struct net *net, u32 addr) +static void tipc_net_finalize(struct net *net, u32 addr) { struct tipc_net *tn = tipc_net(net); - if (!cmpxchg(&tn->node_addr, 0, addr)) { - tipc_set_node_addr(net, addr); - tipc_named_reinit(net); - tipc_sk_reinit(net); - tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr, - TIPC_CLUSTER_SCOPE, 0, addr); - } + if (cmpxchg(&tn->node_addr, 0, addr)) + return; + tipc_set_node_addr(net, addr); + tipc_named_reinit(net); + tipc_sk_reinit(net); + tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr, + TIPC_CLUSTER_SCOPE, 0, addr); +} + +static void tipc_net_finalize_work(struct work_struct *work) +{ + struct tipc_net_work *fwork; + + fwork = container_of(work, struct tipc_net_work, work); + tipc_net_finalize(fwork->net, fwork->addr); + kfree(fwork); +} + +void tipc_sched_net_finalize(struct net *net, u32 addr) +{ + struct tipc_net_work *fwork = kzalloc(sizeof(*fwork), GFP_ATOMIC); + + if (!fwork) + return; + INIT_WORK(&fwork->work, tipc_net_finalize_work); + fwork->net = net; + fwork->addr = addr; + schedule_work(&fwork->work); } void tipc_net_stop(struct net *net) diff --git a/net/tipc/net.h b/net/tipc/net.h index 09ad02b50bb1..b7f2e364eb99 100644 --- a/net/tipc/net.h +++ b/net/tipc/net.h @@ -42,7 +42,7 @@ extern const struct nla_policy tipc_nl_net_policy[]; int tipc_net_init(struct net *net, u8 *node_id, u32 addr); -void tipc_net_finalize(struct net *net, u32 addr); +void tipc_sched_net_finalize(struct net *net, u32 addr); void tipc_net_stop(struct net *net); int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info); -- cgit v1.2.3 From 1c1274a56999fbdf9cf84e332b28448bb2d55221 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Sat, 17 Nov 2018 12:17:06 -0500 Subject: tipc: don't assume linear buffer when reading ancillary data The code for reading ancillary data from a received buffer is assuming the buffer is linear. To make this assumption true we have to linearize the buffer before message data is read. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 636e6131769d..b57b1be7252b 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1555,16 +1555,17 @@ static void tipc_sk_set_orig_addr(struct msghdr *m, struct sk_buff *skb) /** * tipc_sk_anc_data_recv - optionally capture ancillary data for received message * @m: descriptor for message info - * @msg: received message header + * @skb: received message buffer * @tsk: TIPC port associated with message * * Note: Ancillary data is not captured if not requested by receiver. * * Returns 0 if successful, otherwise errno */ -static int tipc_sk_anc_data_recv(struct msghdr *m, struct tipc_msg *msg, +static int tipc_sk_anc_data_recv(struct msghdr *m, struct sk_buff *skb, struct tipc_sock *tsk) { + struct tipc_msg *msg; u32 anc_data[3]; u32 err; u32 dest_type; @@ -1573,6 +1574,7 @@ static int tipc_sk_anc_data_recv(struct msghdr *m, struct tipc_msg *msg, if (likely(m->msg_controllen == 0)) return 0; + msg = buf_msg(skb); /* Optionally capture errored message object(s) */ err = msg ? msg_errcode(msg) : 0; @@ -1583,6 +1585,9 @@ static int tipc_sk_anc_data_recv(struct msghdr *m, struct tipc_msg *msg, if (res) return res; if (anc_data[1]) { + if (skb_linearize(skb)) + return -ENOMEM; + msg = buf_msg(skb); res = put_cmsg(m, SOL_TIPC, TIPC_RETDATA, anc_data[1], msg_data(msg)); if (res) @@ -1744,9 +1749,10 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, /* Collect msg meta data, including error code and rejected data */ tipc_sk_set_orig_addr(m, skb); - rc = tipc_sk_anc_data_recv(m, hdr, tsk); + rc = tipc_sk_anc_data_recv(m, skb, tsk); if (unlikely(rc)) goto exit; + hdr = buf_msg(skb); /* Capture data if non-error msg, otherwise just set return value */ if (likely(!err)) { @@ -1856,9 +1862,10 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m, /* Collect msg meta data, incl. error code and rejected data */ if (!copied) { tipc_sk_set_orig_addr(m, skb); - rc = tipc_sk_anc_data_recv(m, hdr, tsk); + rc = tipc_sk_anc_data_recv(m, skb, tsk); if (rc) break; + hdr = buf_msg(skb); } /* Copy data if msg ok, otherwise return error/partial data */ -- cgit v1.2.3 From 7ddacfa564870cdd97275fd87decb6174abc6380 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 18 Nov 2018 10:45:30 -0800 Subject: ipv6: Fix PMTU updates for UDP/raw sockets in presence of VRF Preethi reported that PMTU discovery for UDP/raw applications is not working in the presence of VRF when the socket is not bound to a device. The problem is that ip6_sk_update_pmtu does not consider the L3 domain of the skb device if the socket is not bound. Update the function to set oif to the L3 master device if relevant. Fixes: ca254490c8df ("net: Add VRF support to IPv6 stack") Reported-by: Preethi Ramachandra Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 14b422f35504..059f0531f7c1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2359,10 +2359,13 @@ EXPORT_SYMBOL_GPL(ip6_update_pmtu); void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) { + int oif = sk->sk_bound_dev_if; struct dst_entry *dst; - ip6_update_pmtu(skb, sock_net(sk), mtu, - sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid); + if (!oif && skb->dev) + oif = l3mdev_master_ifindex(skb->dev); + + ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid); dst = __sk_dst_get(sk); if (!dst || !dst->obsolete || -- cgit v1.2.3