From 90db6d772f749e38171d04619a5e3cd8804a6d02 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 10 Mar 2020 09:41:48 -0700 Subject: bpf, sockmap: Remove bucket->lock from sock_{hash|map}_free The bucket->lock is not needed in the sock_hash_free and sock_map_free calls, in fact it is causing a splat due to being inside an RCU block. | BUG: sleeping function called from invalid context at net/core/sock.c:2935 | in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 62, name: kworker/0:1 | 3 locks held by kworker/0:1/62: | #0: ffff88813b019748 ((wq_completion)events){+.+.}, at: process_one_work+0x1d7/0x5e0 | #1: ffffc900000abe50 ((work_completion)(&map->work)){+.+.}, at: process_one_work+0x1d7/0x5e0 | #2: ffff8881381f6df8 (&stab->lock){+...}, at: sock_map_free+0x26/0x180 | CPU: 0 PID: 62 Comm: kworker/0:1 Not tainted 5.5.0-04008-g7b083332376e #454 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014 | Workqueue: events bpf_map_free_deferred | Call Trace: | dump_stack+0x71/0xa0 | ___might_sleep.cold+0xa6/0xb6 | lock_sock_nested+0x28/0x90 | sock_map_free+0x5f/0x180 | bpf_map_free_deferred+0x58/0x80 | process_one_work+0x260/0x5e0 | worker_thread+0x4d/0x3e0 | kthread+0x108/0x140 | ? process_one_work+0x5e0/0x5e0 | ? kthread_park+0x90/0x90 | ret_from_fork+0x3a/0x50 The reason we have stab->lock and bucket->locks in sockmap code is to handle checking EEXIST in update/delete cases. We need to be careful during an update operation that we check for EEXIST and we need to ensure that the psock object is not in some partial state of removal/insertion while we do this. So both map_update_common and sock_map_delete need to guard against being run together and potentially deleting an entry we are checking, etc. But by the time we get to the tear-down code in sock_{map|hash}_free we have already disconnected the map and we just did synchronize_rcu() in the line above so no updates/deletes should be in flight. Because of this we can drop the bucket locks from the map free'ing code, noting no update/deletes can be in-flight. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Reported-by: Jakub Sitnicki Suggested-by: Jakub Sitnicki Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/158385850787.30597.8346421465837046618.stgit@john-Precision-5820-Tower --- net/core/sock_map.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 085cef5857bb..b70c844a88ec 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -233,8 +233,11 @@ static void sock_map_free(struct bpf_map *map) struct bpf_stab *stab = container_of(map, struct bpf_stab, map); int i; + /* After the sync no updates or deletes will be in-flight so it + * is safe to walk map and remove entries without risking a race + * in EEXIST update case. 
+ */ synchronize_rcu(); - raw_spin_lock_bh(&stab->lock); for (i = 0; i < stab->map.max_entries; i++) { struct sock **psk = &stab->sks[i]; struct sock *sk; @@ -248,7 +251,6 @@ static void sock_map_free(struct bpf_map *map) release_sock(sk); } } - raw_spin_unlock_bh(&stab->lock); /* wait for psock readers accessing its map link */ synchronize_rcu(); @@ -863,10 +865,13 @@ static void sock_hash_free(struct bpf_map *map) struct hlist_node *node; int i; + /* After the sync no updates or deletes will be in-flight so it + * is safe to walk map and remove entries without risking a race + * in EEXIST update case. + */ synchronize_rcu(); for (i = 0; i < htab->buckets_num; i++) { bucket = sock_hash_select_bucket(htab, i); - raw_spin_lock_bh(&bucket->lock); hlist_for_each_entry_safe(elem, node, &bucket->head, node) { hlist_del_rcu(&elem->node); lock_sock(elem->sk); @@ -875,7 +880,6 @@ static void sock_hash_free(struct bpf_map *map) rcu_read_unlock(); release_sock(elem->sk); } - raw_spin_unlock_bh(&bucket->lock); } /* wait for psock readers accessing its map link */ -- cgit v1.2.3 From 158fe6665389964a1de212818b4a5c52b7f7aff4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Mar 2020 09:05:38 +0000 Subject: rxrpc: Abstract out the calculation of whether there's Tx space Abstract out the calculation of there being sufficient Tx buffer space. This is reproduced several times in the rxrpc sendmsg code. Signed-off-by: David Howells --- net/rxrpc/sendmsg.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 813fd6888142..a13051d38097 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -17,6 +17,21 @@ #include #include "ar-internal.h" +/* + * Return true if there's sufficient Tx queue space. + */ +static bool rxrpc_check_tx_space(struct rxrpc_call *call, rxrpc_seq_t *_tx_win) +{ + unsigned int win_size = + min_t(unsigned int, call->tx_winsize, + call->cong_cwnd + call->cong_extra); + rxrpc_seq_t tx_win = READ_ONCE(call->tx_hard_ack); + + if (_tx_win) + *_tx_win = tx_win; + return call->tx_top - tx_win < win_size; +} + /* * Wait for space to appear in the Tx queue or a signal to occur. 
*/ @@ -26,9 +41,7 @@ static int rxrpc_wait_for_tx_window_intr(struct rxrpc_sock *rx, { for (;;) { set_current_state(TASK_INTERRUPTIBLE); - if (call->tx_top - call->tx_hard_ack < - min_t(unsigned int, call->tx_winsize, - call->cong_cwnd + call->cong_extra)) + if (rxrpc_check_tx_space(call, NULL)) return 0; if (call->state >= RXRPC_CALL_COMPLETE) @@ -68,9 +81,7 @@ static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx, set_current_state(TASK_UNINTERRUPTIBLE); tx_win = READ_ONCE(call->tx_hard_ack); - if (call->tx_top - tx_win < - min_t(unsigned int, call->tx_winsize, - call->cong_cwnd + call->cong_extra)) + if (rxrpc_check_tx_space(call, &tx_win)) return 0; if (call->state >= RXRPC_CALL_COMPLETE) @@ -302,9 +313,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, _debug("alloc"); - if (call->tx_top - call->tx_hard_ack >= - min_t(unsigned int, call->tx_winsize, - call->cong_cwnd + call->cong_extra)) { + if (!rxrpc_check_tx_space(call, NULL)) { ret = -EAGAIN; if (msg->msg_flags & MSG_DONTWAIT) goto maybe_error; -- cgit v1.2.3 From e138aa7d3271ac1b0690ae2c9b04d51468dce1d6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Mar 2020 09:22:09 +0000 Subject: rxrpc: Fix call interruptibility handling Fix the interruptibility of kernel-initiated client calls so that they're either only interruptible when they're waiting for a call slot to become available or they're not interruptible at all. Either way, they're not interruptible during transmission. This should help prevent StoreData calls from being interrupted when writeback is in progress. It doesn't, however, handle interruption during the receive phase. Userspace-initiated calls are still interruptible. After the signal has been handled, sendmsg() will return the amount of data copied out of the buffer and userspace can perform another sendmsg() call to continue transmission. Fixes: bc5e3a546d55 ("rxrpc: Use MSG_WAITALL to tell sendmsg() to temporarily ignore signals") Signed-off-by: David Howells --- fs/afs/rxrpc.c | 3 ++- include/net/af_rxrpc.h | 8 +++++++- net/rxrpc/af_rxrpc.c | 4 ++-- net/rxrpc/ar-internal.h | 4 ++-- net/rxrpc/call_object.c | 3 +-- net/rxrpc/conn_client.c | 13 ++++++++++--- net/rxrpc/sendmsg.c | 44 ++++++++++++++++++++++++++++++++++++-------- 7 files changed, 60 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 58d396592250..4c28712bb7f6 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -413,7 +413,8 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) afs_wake_up_async_call : afs_wake_up_call_waiter), call->upgrade, - call->intr, + (call->intr ? RXRPC_PREINTERRUPTIBLE : + RXRPC_UNINTERRUPTIBLE), call->debug_id); if (IS_ERR(rxcall)) { ret = PTR_ERR(rxcall); diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 1abae3c340a5..8e547b4d88c8 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -16,6 +16,12 @@ struct sock; struct socket; struct rxrpc_call; +enum rxrpc_interruptibility { + RXRPC_INTERRUPTIBLE, /* Call is interruptible */ + RXRPC_PREINTERRUPTIBLE, /* Call can be cancelled whilst waiting for a slot */ + RXRPC_UNINTERRUPTIBLE, /* Call should not be interruptible at all */ +}; + /* * Debug ID counter for tracing. 
*/ @@ -41,7 +47,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *, gfp_t, rxrpc_notify_rx_t, bool, - bool, + enum rxrpc_interruptibility, unsigned int); int rxrpc_kernel_send_data(struct socket *, struct rxrpc_call *, struct msghdr *, size_t, diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index fe42f986cd94..7603cf811f75 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -285,7 +285,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, gfp_t gfp, rxrpc_notify_rx_t notify_rx, bool upgrade, - bool intr, + enum rxrpc_interruptibility interruptibility, unsigned int debug_id) { struct rxrpc_conn_parameters cp; @@ -310,7 +310,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, memset(&p, 0, sizeof(p)); p.user_call_ID = user_call_ID; p.tx_total_len = tx_total_len; - p.intr = intr; + p.interruptibility = interruptibility; memset(&cp, 0, sizeof(cp)); cp.local = rx->local; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 7d730c438404..1f72f43b082d 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -489,7 +489,6 @@ enum rxrpc_call_flag { RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */ RXRPC_CALL_RX_HEARD, /* The peer responded at least once to this call */ RXRPC_CALL_RX_UNDERRUN, /* Got data underrun */ - RXRPC_CALL_IS_INTR, /* The call is interruptible */ RXRPC_CALL_DISCONNECTED, /* The call has been disconnected */ }; @@ -598,6 +597,7 @@ struct rxrpc_call { atomic_t usage; u16 service_id; /* service ID */ u8 security_ix; /* Security type */ + enum rxrpc_interruptibility interruptibility; /* At what point call may be interrupted */ u32 call_id; /* call ID on connection */ u32 cid; /* connection ID plus channel index */ int debug_id; /* debug ID for printks */ @@ -721,7 +721,7 @@ struct rxrpc_call_params { u32 normal; /* Max time since last call packet (msec) */ } timeouts; u8 nr_timeouts; /* Number of timeouts specified */ - bool intr; /* The call is interruptible */ + enum rxrpc_interruptibility interruptibility; /* How is interruptible is the call? 
*/ }; struct rxrpc_send_params { diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index c9f34b0a11df..f07970207b54 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -237,8 +237,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, return call; } - if (p->intr) - __set_bit(RXRPC_CALL_IS_INTR, &call->flags); + call->interruptibility = p->interruptibility; call->tx_total_len = p->tx_total_len; trace_rxrpc_call(call->debug_id, rxrpc_call_new_client, atomic_read(&call->usage), diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index ea7d4c21f889..f2a1a5dbb5a7 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -655,13 +655,20 @@ static int rxrpc_wait_for_channel(struct rxrpc_call *call, gfp_t gfp) add_wait_queue_exclusive(&call->waitq, &myself); for (;;) { - if (test_bit(RXRPC_CALL_IS_INTR, &call->flags)) + switch (call->interruptibility) { + case RXRPC_INTERRUPTIBLE: + case RXRPC_PREINTERRUPTIBLE: set_current_state(TASK_INTERRUPTIBLE); - else + break; + case RXRPC_UNINTERRUPTIBLE: + default: set_current_state(TASK_UNINTERRUPTIBLE); + break; + } if (call->call_id) break; - if (test_bit(RXRPC_CALL_IS_INTR, &call->flags) && + if ((call->interruptibility == RXRPC_INTERRUPTIBLE || + call->interruptibility == RXRPC_PREINTERRUPTIBLE) && signal_pending(current)) { ret = -ERESTARTSYS; break; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index a13051d38097..1eccfb92c9e1 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -62,7 +62,7 @@ static int rxrpc_wait_for_tx_window_intr(struct rxrpc_sock *rx, * Wait for space to appear in the Tx queue uninterruptibly, but with * a timeout of 2*RTT if no progress was made and a signal occurred. */ -static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx, +static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx, struct rxrpc_call *call) { rxrpc_seq_t tx_start, tx_win; @@ -87,8 +87,7 @@ static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx, if (call->state >= RXRPC_CALL_COMPLETE) return call->error; - if (test_bit(RXRPC_CALL_IS_INTR, &call->flags) && - timeout == 0 && + if (timeout == 0 && tx_win == tx_start && signal_pending(current)) return -EINTR; @@ -102,6 +101,26 @@ static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx, } } +/* + * Wait for space to appear in the Tx queue uninterruptibly. 
+ */ +static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx, + struct rxrpc_call *call, + long *timeo) +{ + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (rxrpc_check_tx_space(call, NULL)) + return 0; + + if (call->state >= RXRPC_CALL_COMPLETE) + return call->error; + + trace_rxrpc_transmit(call, rxrpc_transmit_wait); + *timeo = schedule_timeout(*timeo); + } +} + /* * wait for space to appear in the transmit/ACK window * - caller holds the socket locked @@ -119,10 +138,19 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, add_wait_queue(&call->waitq, &myself); - if (waitall) - ret = rxrpc_wait_for_tx_window_nonintr(rx, call); - else - ret = rxrpc_wait_for_tx_window_intr(rx, call, timeo); + switch (call->interruptibility) { + case RXRPC_INTERRUPTIBLE: + if (waitall) + ret = rxrpc_wait_for_tx_window_waitall(rx, call); + else + ret = rxrpc_wait_for_tx_window_intr(rx, call, timeo); + break; + case RXRPC_PREINTERRUPTIBLE: + case RXRPC_UNINTERRUPTIBLE: + default: + ret = rxrpc_wait_for_tx_window_nonintr(rx, call, timeo); + break; + } remove_wait_queue(&call->waitq, &myself); set_current_state(TASK_RUNNING); @@ -628,7 +656,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) .call.tx_total_len = -1, .call.user_call_ID = 0, .call.nr_timeouts = 0, - .call.intr = true, + .call.interruptibility = RXRPC_INTERRUPTIBLE, .abort_code = 0, .command = RXRPC_CMD_SEND_DATA, .exclusive = false, -- cgit v1.2.3 From 498b577660f08cef5d9e78e0ed6dcd4c0939e98c Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Mar 2020 17:30:27 +0000 Subject: rxrpc: Fix sendmsg(MSG_WAITALL) handling Fix the handling of sendmsg() with MSG_WAITALL for userspace to round the timeout for when a signal occurs up to at least two jiffies as a 1 jiffy timeout may end up being effectively 0 if jiffies wraps at the wrong time. Fixes: bc5e3a546d55 ("rxrpc: Use MSG_WAITALL to tell sendmsg() to temporarily ignore signals") Signed-off-by: David Howells --- net/rxrpc/sendmsg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 1eccfb92c9e1..0fcf157aa09f 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -71,8 +71,8 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx, rtt = READ_ONCE(call->peer->rtt); rtt2 = nsecs_to_jiffies64(rtt) * 2; - if (rtt2 < 1) - rtt2 = 1; + if (rtt2 < 2) + rtt2 = 2; timeout = rtt2; tx_start = READ_ONCE(call->tx_hard_ack); -- cgit v1.2.3 From 7d7587db0d7fd1138f2afcffdc46a8e15630b944 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 12 Mar 2020 21:40:06 +0000 Subject: afs: Fix client call Rx-phase signal handling Fix the handling of signals in client rxrpc calls made by the afs filesystem. Ignore signals completely, leaving call abandonment or connection loss to be detected by timeouts inside AF_RXRPC. Allowing a filesystem call to be interrupted after the entire request has been transmitted and an abort sent means that the server may or may not have done the action - and we don't know. It may even be worse than that for older servers. 
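To illustrate, the receive-phase wait in afs_wait_for_call_to_complete() ends up with roughly the following shape (a simplified sketch of the result of the diff below, not the verbatim code; the guard conditions are abbreviated):

	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (call->need_attention) {
			/* deliver any messages that are in the queue */
			call->need_attention = false;
			__set_current_state(TASK_RUNNING);
			afs_deliver_to_call(call);
			continue;
		}
		if (afs_check_call_state(call, AFS_CALL_COMPLETE))
			break;
		if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall))
			break;	/* rxrpc terminated the call */
		/* sleep with no timeout and no signal_pending() check */
		schedule();
	}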
Fixes: bc5e3a546d55 ("rxrpc: Use MSG_WAITALL to tell sendmsg() to temporarily ignore signals") Signed-off-by: David Howells --- fs/afs/rxrpc.c | 34 ++-------------------------------- include/net/af_rxrpc.h | 4 +--- net/rxrpc/af_rxrpc.c | 33 +++------------------------------ net/rxrpc/ar-internal.h | 1 - net/rxrpc/input.c | 1 - 5 files changed, 6 insertions(+), 67 deletions(-) (limited to 'net') diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 972e3aafa361..1ecc67da6c1a 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -604,11 +604,7 @@ call_complete: long afs_wait_for_call_to_complete(struct afs_call *call, struct afs_addr_cursor *ac) { - signed long rtt2, timeout; long ret; - bool stalled = false; - u64 rtt; - u32 life, last_life; bool rxrpc_complete = false; DECLARE_WAITQUEUE(myself, current); @@ -619,14 +615,6 @@ long afs_wait_for_call_to_complete(struct afs_call *call, if (ret < 0) goto out; - rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall); - rtt2 = nsecs_to_jiffies64(rtt) * 2; - if (rtt2 < 2) - rtt2 = 2; - - timeout = rtt2; - rxrpc_kernel_check_life(call->net->socket, call->rxcall, &last_life); - add_wait_queue(&call->waitq, &myself); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); @@ -637,37 +625,19 @@ long afs_wait_for_call_to_complete(struct afs_call *call, call->need_attention = false; __set_current_state(TASK_RUNNING); afs_deliver_to_call(call); - timeout = rtt2; continue; } if (afs_check_call_state(call, AFS_CALL_COMPLETE)) break; - if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall, &life)) { + if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall)) { /* rxrpc terminated the call. */ rxrpc_complete = true; break; } - if (call->intr && timeout == 0 && - life == last_life && signal_pending(current)) { - if (stalled) - break; - __set_current_state(TASK_RUNNING); - rxrpc_kernel_probe_life(call->net->socket, call->rxcall); - timeout = rtt2; - stalled = true; - continue; - } - - if (life != last_life) { - timeout = rtt2; - last_life = life; - stalled = false; - } - - timeout = schedule_timeout(timeout); + schedule(); } remove_wait_queue(&call->waitq, &myself); diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 8e547b4d88c8..04e97bab6f28 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -64,9 +64,7 @@ int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, rxrpc_user_attach_call_t, unsigned long, gfp_t, unsigned int); void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64); -bool rxrpc_kernel_check_life(const struct socket *, const struct rxrpc_call *, - u32 *); -void rxrpc_kernel_probe_life(struct socket *, struct rxrpc_call *); +bool rxrpc_kernel_check_life(const struct socket *, const struct rxrpc_call *); u32 rxrpc_kernel_get_epoch(struct socket *, struct rxrpc_call *); bool rxrpc_kernel_get_reply_time(struct socket *, struct rxrpc_call *, ktime_t *); diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 7603cf811f75..15ee92d79581 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -371,44 +371,17 @@ EXPORT_SYMBOL(rxrpc_kernel_end_call); * rxrpc_kernel_check_life - Check to see whether a call is still alive * @sock: The socket the call is on * @call: The call to check - * @_life: Where to store the life value * - * Allow a kernel service to find out whether a call is still alive - ie. we're - * getting ACKs from the server. 
Passes back in *_life a number representing - * the life state which can be compared to that returned by a previous call and - * return true if the call is still alive. - * - * If the life state stalls, rxrpc_kernel_probe_life() should be called and - * then 2RTT waited. + * Allow a kernel service to find out whether a call is still alive - + * ie. whether it has completed. */ bool rxrpc_kernel_check_life(const struct socket *sock, - const struct rxrpc_call *call, - u32 *_life) + const struct rxrpc_call *call) { - *_life = call->acks_latest; return call->state != RXRPC_CALL_COMPLETE; } EXPORT_SYMBOL(rxrpc_kernel_check_life); -/** - * rxrpc_kernel_probe_life - Poke the peer to see if it's still alive - * @sock: The socket the call is on - * @call: The call to check - * - * In conjunction with rxrpc_kernel_check_life(), allow a kernel service to - * find out whether a call is still alive by pinging it. This should cause the - * life state to be bumped in about 2*RTT. - * - * The must be called in TASK_RUNNING state on pain of might_sleep() objecting. - */ -void rxrpc_kernel_probe_life(struct socket *sock, struct rxrpc_call *call) -{ - rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, true, false, - rxrpc_propose_ack_ping_for_check_life); - rxrpc_send_ack_packet(call, true, NULL); -} -EXPORT_SYMBOL(rxrpc_kernel_probe_life); - /** * rxrpc_kernel_get_epoch - Retrieve the epoch value from a call. * @sock: The socket the call is on diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 1f72f43b082d..3eb1ab40ca5c 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -675,7 +675,6 @@ struct rxrpc_call { /* transmission-phase ACK management */ ktime_t acks_latest_ts; /* Timestamp of latest ACK received */ - rxrpc_serial_t acks_latest; /* serial number of latest ACK received */ rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */ rxrpc_seq_t acks_lost_top; /* tx_top at the time lost-ack ping sent */ rxrpc_serial_t acks_lost_ping; /* Serial number of probe ACK */ diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index ef10fbf71b15..69e09d69c896 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -882,7 +882,6 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) before(prev_pkt, call->ackr_prev_seq)) goto out; call->acks_latest_ts = skb->tstamp; - call->acks_latest = sp->hdr.serial; call->ackr_first_seq = first_soft_ack; call->ackr_prev_seq = prev_pkt; -- cgit v1.2.3 From b1be2e8cd290f620777bfdb8aa00890cd2fa02b5 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 11 Mar 2020 22:42:27 -0700 Subject: net_sched: hold rtnl lock in tcindex_partial_destroy_work() syzbot reported a use-after-free in tcindex_dump(). This is due to the lack of RTNL in the deferred rcu work. We queue this work with RTNL in tcindex_change(), later, tcindex_dump() is called: fh = tp->ops->get(tp, t->tcm_handle); ... err = tp->ops->change(..., &fh, ...); tfilter_notify(..., fh, ...); but there is nothing to serialize the pending tcindex_partial_destroy_work() with tcindex_dump(). Fix this by simply holding RTNL in tcindex_partial_destroy_work(), so that it won't be called until RTNL is released after tc_new_tfilter() is completed. Reported-and-tested-by: syzbot+653090db2562495901dc@syzkaller.appspotmail.com Fixes: 3d210534cc93 ("net_sched: fix a race condition in tcindex_destroy()") Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- net/sched/cls_tcindex.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 09b7dc5fe7e0..f2cb24b6f0cf 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -261,8 +261,10 @@ static void tcindex_partial_destroy_work(struct work_struct *work) struct tcindex_data, rwork); + rtnl_lock(); kfree(p->perfect); kfree(p); + rtnl_unlock(); } static void tcindex_free_perfect_hash(struct tcindex_data *cp) -- cgit v1.2.3 From 0d1c3530e1bd38382edef72591b78e877e0edcd3 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 11 Mar 2020 22:42:28 -0700 Subject: net_sched: keep alloc_hash updated after hash allocation In commit 599be01ee567 ("net_sched: fix an OOB access in cls_tcindex") I moved cp->hash calculation before the first tcindex_alloc_perfect_hash(), but cp->alloc_hash is left untouched. This difference could lead to another out-of-bounds access. cp->alloc_hash should always be the size allocated, so we should update it after this tcindex_alloc_perfect_hash(). Reported-and-tested-by: syzbot+dcc34d54d68ef7d2d53d@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+c72da7b9ed57cde6fca2@syzkaller.appspotmail.com Fixes: 599be01ee567 ("net_sched: fix an OOB access in cls_tcindex") Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_tcindex.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index f2cb24b6f0cf..9904299424a1 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -359,6 +359,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, if (tcindex_alloc_perfect_hash(net, cp) < 0) goto errout; + cp->alloc_hash = cp->hash; for (i = 0; i < min(cp->hash, p->hash); i++) cp->perfect[i].res = p->perfect[i].res; balloc = 1; -- cgit v1.2.3 From 13d0f7b814d9b4c67e60d8c2820c86ea181e7d99 Mon Sep 17 00:00:00 2001 From: Bruno Meneguele Date: Thu, 12 Mar 2020 20:08:20 -0300 Subject: net/bpfilter: fix dprintf usage for /dev/kmsg The bpfilter UMH code was recently changed to log its informative messages to /dev/kmsg, however this interface doesn't support SEEK_CUR yet, which dprintf() uses. As a result, dprintf() returns -EINVAL and doesn't log anything. There have already been some discussions in the past about supporting SEEK_CUR in the /dev/kmsg interface, but they were not concluded. Since the only user of this from a userspace perspective inside the kernel is the bpfilter UMH (userspace) module, it's better to correct it here instead of waiting for a conclusion on the interface. Fixes: 36c4357c63f3 ("net: bpfilter: print umh messages to /dev/kmsg") Signed-off-by: Bruno Meneguele Signed-off-by: David S. 
Miller --- net/bpfilter/main.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c index 77396a098fbe..efea4874743e 100644 --- a/net/bpfilter/main.c +++ b/net/bpfilter/main.c @@ -10,7 +10,7 @@ #include #include "msgfmt.h" -int debug_fd; +FILE *debug_f; static int handle_get_cmd(struct mbox_request *cmd) { @@ -35,9 +35,10 @@ static void loop(void) struct mbox_reply reply; int n; + fprintf(debug_f, "testing the buffer\n"); n = read(0, &req, sizeof(req)); if (n != sizeof(req)) { - dprintf(debug_fd, "invalid request %d\n", n); + fprintf(debug_f, "invalid request %d\n", n); return; } @@ -47,7 +48,7 @@ static void loop(void) n = write(1, &reply, sizeof(reply)); if (n != sizeof(reply)) { - dprintf(debug_fd, "reply failed %d\n", n); + fprintf(debug_f, "reply failed %d\n", n); return; } } @@ -55,9 +56,10 @@ static void loop(void) int main(void) { - debug_fd = open("/dev/kmsg", 00000002); - dprintf(debug_fd, "Started bpfilter\n"); + debug_f = fopen("/dev/kmsg", "w"); + setvbuf(debug_f, 0, _IOLBF, 0); + fprintf(debug_f, "Started bpfilter\n"); loop(); - close(debug_fd); + fclose(debug_f); return 0; } -- cgit v1.2.3 From e1f8f78ffe9854308b9e12a73ebe4e909074fc33 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 13 Mar 2020 13:39:36 +0200 Subject: net: ip_gre: Separate ERSPAN newlink / changelink callbacks ERSPAN shares most of the code path with GRE and gretap code. While that helps keep the code compact, it is also error prone. Currently a broken userspace can turn a gretap tunnel into a de facto ERSPAN one by passing IFLA_GRE_ERSPAN_VER. There has been a similar issue in ip6gretap in the past. To prevent these problems in future, split the newlink and changelink code paths. Split the ERSPAN code out of ipgre_netlink_parms() into a new function erspan_netlink_parms(). Extract a piece of common logic from ipgre_newlink() and ipgre_changelink() into ipgre_newlink_encap_setup(). Add erspan_newlink() and erspan_changelink(). Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN") Signed-off-by: Petr Machata Signed-off-by: David S. 
Miller --- net/ipv4/ip_gre.c | 103 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 85 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 8274f98c511c..7765c65fc7d2 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1153,6 +1153,22 @@ static int ipgre_netlink_parms(struct net_device *dev, if (data[IFLA_GRE_FWMARK]) *fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]); + return 0; +} + +static int erspan_netlink_parms(struct net_device *dev, + struct nlattr *data[], + struct nlattr *tb[], + struct ip_tunnel_parm *parms, + __u32 *fwmark) +{ + struct ip_tunnel *t = netdev_priv(dev); + int err; + + err = ipgre_netlink_parms(dev, data, tb, parms, fwmark); + if (err) + return err; + if (data[IFLA_GRE_ERSPAN_VER]) { t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); @@ -1276,45 +1292,70 @@ static void ipgre_tap_setup(struct net_device *dev) ip_tunnel_setup(dev, gre_tap_net_id); } -static int ipgre_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack *extack) +static int +ipgre_newlink_encap_setup(struct net_device *dev, struct nlattr *data[]) { - struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; - __u32 fwmark = 0; - int err; if (ipgre_netlink_encap_parms(data, &ipencap)) { struct ip_tunnel *t = netdev_priv(dev); - err = ip_tunnel_encap_setup(t, &ipencap); + int err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } + return 0; +} + +static int ipgre_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct ip_tunnel_parm p; + __u32 fwmark = 0; + int err; + + err = ipgre_newlink_encap_setup(dev, data); + if (err) + return err; + err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark); if (err < 0) return err; return ip_tunnel_newlink(dev, tb, &p, fwmark); } +static int erspan_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct ip_tunnel_parm p; + __u32 fwmark = 0; + int err; + + err = ipgre_newlink_encap_setup(dev, data); + if (err) + return err; + + err = erspan_netlink_parms(dev, data, tb, &p, &fwmark); + if (err) + return err; + return ip_tunnel_newlink(dev, tb, &p, fwmark); +} + static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_encap ipencap; __u32 fwmark = t->fwmark; struct ip_tunnel_parm p; int err; - if (ipgre_netlink_encap_parms(data, &ipencap)) { - err = ip_tunnel_encap_setup(t, &ipencap); - - if (err < 0) - return err; - } + err = ipgre_newlink_encap_setup(dev, data); + if (err) + return err; err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark); if (err < 0) @@ -1327,8 +1368,34 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], t->parms.i_flags = p.i_flags; t->parms.o_flags = p.o_flags; - if (strcmp(dev->rtnl_link_ops->kind, "erspan")) - ipgre_link_update(dev, !tb[IFLA_MTU]); + ipgre_link_update(dev, !tb[IFLA_MTU]); + + return 0; +} + +static int erspan_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct ip_tunnel *t = netdev_priv(dev); + __u32 fwmark = t->fwmark; + struct ip_tunnel_parm p; + int err; + + err = ipgre_newlink_encap_setup(dev, data); + if (err) + return err; + + err = 
erspan_netlink_parms(dev, data, tb, &p, &fwmark); + if (err < 0) + return err; + + err = ip_tunnel_changelink(dev, tb, &p, fwmark); + if (err < 0) + return err; + + t->parms.i_flags = p.i_flags; + t->parms.o_flags = p.o_flags; return 0; } @@ -1519,8 +1586,8 @@ static struct rtnl_link_ops erspan_link_ops __read_mostly = { .priv_size = sizeof(struct ip_tunnel), .setup = erspan_setup, .validate = erspan_validate, - .newlink = ipgre_newlink, - .changelink = ipgre_changelink, + .newlink = erspan_newlink, + .changelink = erspan_changelink, .dellink = ip_tunnel_dellink, .get_size = ipgre_get_size, .fill_info = ipgre_fill_info, -- cgit v1.2.3 From 61fad6816fc10fb8793a925d5c1256d1c3db0cd2 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 13 Mar 2020 12:18:09 -0400 Subject: net/packet: tpacket_rcv: avoid a producer race condition PACKET_RX_RING can cause multiple writers to access the same slot if a fast writer wraps the ring while a slow writer is still copying. This is particularly likely with few, large slots (e.g., GSO packets). Synchronize kernel thread ownership of rx ring slots with a bitmap. Writers acquire a slot race-free by testing tp_status TP_STATUS_KERNEL while holding the sk receive queue lock. They release this lock before copying and set tp_status to TP_STATUS_USER to release to userspace when done. During copying, another writer may take the lock, also see TP_STATUS_KERNEL, and start writing to the same slot. Introduce a new rx_owner_map bitmap with a bit per slot. To acquire a slot, test and set with the lock held. To release race-free, update tp_status and owner bit as a transaction, so take the lock again. This is one of a variety of discussed options (see Link below): * instead of a shadow ring, embed the data in the slot itself, such as in tp_padding. But any test for this field may match a value left by userspace, causing deadlock. * avoid the lock on release. This leaves a small race if releasing the shadow slot before setting TP_STATUS_USER. The below reproducer showed that this race is not academic. If releasing the slot after tp_status, the race is more subtle. See the first link for details. * add a new tp_status TP_KERNEL_OWNED to avoid the transactional store of two fields. But, legacy applications may interpret all non-zero tp_status as owned by the user. As libpcap does. So this is possible only as an opt-in for newer processes. It can be added as an optional mode. * embed the struct at the tail of pg_vec to avoid extra allocation. The implementation proved no less complex than a separate field. The additional locking cost on release adds contention, no different than scaling on multicore or multiqueue h/w. In practice, neither the below reproducer nor small-packet tcpdump showed a noticeable change in perf report in cycles spent in spinlock. Where contention is problematic, packet sockets support mitigation through PACKET_FANOUT. And we can consider adding opt-in state TP_KERNEL_OWNED. Easy to reproduce by running multiple netperf or similar TCP_STREAM flows concurrently with `tcpdump -B 129 -n greater 60000`. Based on an earlier patchset by Jon Rosen. See links below. I believe this issue goes back to the introduction of tpacket_rcv, which predates git history. Link: https://www.mail-archive.com/netdev@vger.kernel.org/msg237222.html Suggested-by: Jon Rosen Signed-off-by: Willem de Bruijn Signed-off-by: Jon Rosen Signed-off-by: David S. 
Miller --- net/packet/af_packet.c | 21 +++++++++++++++++++++ net/packet/internal.h | 5 ++++- 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e5b0986215d2..29bd405adbbd 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2173,6 +2173,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct timespec64 ts; __u32 ts_status; bool is_drop_n_account = false; + unsigned int slot_id = 0; bool do_vnet = false; /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. @@ -2275,6 +2276,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (!h.raw) goto drop_n_account; + if (po->tp_version <= TPACKET_V2) { + slot_id = po->rx_ring.head; + if (test_bit(slot_id, po->rx_ring.rx_owner_map)) + goto drop_n_account; + __set_bit(slot_id, po->rx_ring.rx_owner_map); + } + if (do_vnet && virtio_net_hdr_from_skb(skb, h.raw + macoff - sizeof(struct virtio_net_hdr), @@ -2380,7 +2388,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, #endif if (po->tp_version <= TPACKET_V2) { + spin_lock(&sk->sk_receive_queue.lock); __packet_set_status(po, h.raw, status); + __clear_bit(slot_id, po->rx_ring.rx_owner_map); + spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk); } else { prb_clear_blk_fill_status(&po->rx_ring); @@ -4277,6 +4288,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, { struct pgv *pg_vec = NULL; struct packet_sock *po = pkt_sk(sk); + unsigned long *rx_owner_map = NULL; int was_running, order = 0; struct packet_ring_buffer *rb; struct sk_buff_head *rb_queue; @@ -4362,6 +4374,12 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, } break; default: + if (!tx_ring) { + rx_owner_map = bitmap_alloc(req->tp_frame_nr, + GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO); + if (!rx_owner_map) + goto out_free_pg_vec; + } break; } } @@ -4391,6 +4409,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, err = 0; spin_lock_bh(&rb_queue->lock); swap(rb->pg_vec, pg_vec); + if (po->tp_version <= TPACKET_V2) + swap(rb->rx_owner_map, rx_owner_map); rb->frame_max = (req->tp_frame_nr - 1); rb->head = 0; rb->frame_size = req->tp_frame_size; @@ -4422,6 +4442,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, } out_free_pg_vec: + bitmap_free(rx_owner_map); if (pg_vec) free_pg_vec(pg_vec, order, req->tp_block_nr); out: diff --git a/net/packet/internal.h b/net/packet/internal.h index 82fb2b10f790..907f4cd2a718 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -70,7 +70,10 @@ struct packet_ring_buffer { unsigned int __percpu *pending_refcnt; - struct tpacket_kbdq_core prb_bdqc; + union { + unsigned long *rx_owner_map; + struct tpacket_kbdq_core prb_bdqc; + }; }; extern struct mutex fanout_mutex; -- cgit v1.2.3 From 173756b86803655d70af7732079b3aa935e6ab68 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Fri, 13 Mar 2020 06:50:14 +0000 Subject: hsr: use rcu_read_lock() in hsr_get_node_{list/status}() hsr_get_node_{list/status}() are not under rtnl_lock() because they are callback functions of generic netlink. But they use __dev_get_by_index() without rtnl_lock(). So, they could end up using an unsafe net_device pointer. In order to fix it, rcu_read_lock() and dev_get_by_index_rcu() are used instead of __dev_get_by_index(). 
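As a sketch of the difference between the two lookup disciplines (illustrative only; net and ifindex stand for the values taken from the genl request, error handling omitted):

	/* Requires rtnl_lock() to be held, which a generic netlink
	 * handler does not guarantee:
	 */
	dev = __dev_get_by_index(net, ifindex);

	/* RCU variant: safe without RTNL, but the device may only be
	 * used inside the read-side critical section:
	 */
	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	/* ... dereference dev only before rcu_read_unlock() ... */
	rcu_read_unlock();

This is also why the GFP_KERNEL allocations in these handlers become GFP_ATOMIC in the diff below: sleeping is not allowed inside an RCU read-side critical section.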
Fixes: f421436a591d ("net/hsr: Add support for the High-availability Seamless Redundancy protocol (HSRv0)") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_framereg.c | 9 ++------- net/hsr/hsr_netlink.c | 39 +++++++++++++++++++++------------------ 2 files changed, 23 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 3ba7f61be107..a64bb64935a6 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -482,12 +482,9 @@ int hsr_get_node_data(struct hsr_priv *hsr, struct hsr_port *port; unsigned long tdiff; - rcu_read_lock(); node = find_node_by_addr_A(&hsr->node_db, addr); - if (!node) { - rcu_read_unlock(); - return -ENOENT; /* No such entry */ - } + if (!node) + return -ENOENT; ether_addr_copy(addr_b, node->macaddress_B); @@ -522,7 +519,5 @@ int hsr_get_node_data(struct hsr_priv *hsr, *addr_b_ifindex = -1; } - rcu_read_unlock(); - return 0; } diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 8dc0547f01d0..d6760df2ad1f 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -251,15 +251,16 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info) if (!na) goto invalid; - hsr_dev = __dev_get_by_index(genl_info_net(info), - nla_get_u32(info->attrs[HSR_A_IFINDEX])); + rcu_read_lock(); + hsr_dev = dev_get_by_index_rcu(genl_info_net(info), + nla_get_u32(info->attrs[HSR_A_IFINDEX])); if (!hsr_dev) - goto invalid; + goto rcu_unlock; if (!is_hsr_master(hsr_dev)) - goto invalid; + goto rcu_unlock; /* Send reply */ - skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb_out) { res = -ENOMEM; goto fail; @@ -313,12 +314,10 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info) res = nla_put_u16(skb_out, HSR_A_IF1_SEQ, hsr_node_if1_seq); if (res < 0) goto nla_put_failure; - rcu_read_lock(); port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A); if (port) res = nla_put_u32(skb_out, HSR_A_IF1_IFINDEX, port->dev->ifindex); - rcu_read_unlock(); if (res < 0) goto nla_put_failure; @@ -328,20 +327,22 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info) res = nla_put_u16(skb_out, HSR_A_IF2_SEQ, hsr_node_if2_seq); if (res < 0) goto nla_put_failure; - rcu_read_lock(); port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); if (port) res = nla_put_u32(skb_out, HSR_A_IF2_IFINDEX, port->dev->ifindex); - rcu_read_unlock(); if (res < 0) goto nla_put_failure; + rcu_read_unlock(); + genlmsg_end(skb_out, msg_head); genlmsg_unicast(genl_info_net(info), skb_out, info->snd_portid); return 0; +rcu_unlock: + rcu_read_unlock(); invalid: netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL, NULL); return 0; @@ -351,6 +352,7 @@ nla_put_failure: /* Fall through */ fail: + rcu_read_unlock(); return res; } @@ -377,15 +379,16 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) if (!na) goto invalid; - hsr_dev = __dev_get_by_index(genl_info_net(info), - nla_get_u32(info->attrs[HSR_A_IFINDEX])); + rcu_read_lock(); + hsr_dev = dev_get_by_index_rcu(genl_info_net(info), + nla_get_u32(info->attrs[HSR_A_IFINDEX])); if (!hsr_dev) - goto invalid; + goto rcu_unlock; if (!is_hsr_master(hsr_dev)) - goto invalid; + goto rcu_unlock; /* Send reply */ - skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb_out) { res = -ENOMEM; goto fail; @@ -405,14 +408,11 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct 
genl_info *info) hsr = netdev_priv(hsr_dev); - rcu_read_lock(); pos = hsr_get_next_node(hsr, NULL, addr); while (pos) { res = nla_put(skb_out, HSR_A_NODE_ADDR, ETH_ALEN, addr); - if (res < 0) { - rcu_read_unlock(); + if (res < 0) goto nla_put_failure; - } pos = hsr_get_next_node(hsr, pos, addr); } rcu_read_unlock(); @@ -422,6 +422,8 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) return 0; +rcu_unlock: + rcu_read_unlock(); invalid: netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL, NULL); return 0; @@ -431,6 +433,7 @@ nla_put_failure: /* Fall through */ fail: + rcu_read_unlock(); return res; } -- cgit v1.2.3 From ca19c70f5225771c05bcdcb832b4eb84d7271c5e Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Fri, 13 Mar 2020 06:50:24 +0000 Subject: hsr: add restart routine into hsr_get_node_list() hsr_get_node_list() sends node addresses to userspace. If there are too many nodes, it can fail because of the limited buffer size. In order to avoid this failure, a restart routine is added. Fixes: f421436a591d ("net/hsr: Add support for the High-availability Seamless Redundancy protocol (HSRv0)") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_netlink.c | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index d6760df2ad1f..726bfe923999 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -360,16 +360,14 @@ fail: */ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) { - /* For receiving */ - struct nlattr *na; + unsigned char addr[ETH_ALEN]; struct net_device *hsr_dev; - - /* For sending */ struct sk_buff *skb_out; - void *msg_head; struct hsr_priv *hsr; - void *pos; - unsigned char addr[ETH_ALEN]; + bool restart = false; + struct nlattr *na; + void *pos = NULL; + void *msg_head; int res; if (!info) @@ -387,8 +385,9 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) if (!is_hsr_master(hsr_dev)) goto rcu_unlock; +restart: /* Send reply */ - skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); + skb_out = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb_out) { res = -ENOMEM; goto fail; @@ -402,17 +401,28 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) goto nla_put_failure; } - res = nla_put_u32(skb_out, HSR_A_IFINDEX, hsr_dev->ifindex); - if (res < 0) - goto nla_put_failure; + if (!restart) { + res = nla_put_u32(skb_out, HSR_A_IFINDEX, hsr_dev->ifindex); + if (res < 0) + goto nla_put_failure; + } hsr = netdev_priv(hsr_dev); - pos = hsr_get_next_node(hsr, NULL, addr); + if (!pos) + pos = hsr_get_next_node(hsr, NULL, addr); while (pos) { res = nla_put(skb_out, HSR_A_NODE_ADDR, ETH_ALEN, addr); - if (res < 0) + if (res < 0) { + if (res == -EMSGSIZE) { + genlmsg_end(skb_out, msg_head); + genlmsg_unicast(genl_info_net(info), skb_out, + info->snd_portid); + restart = true; + goto restart; + } goto nla_put_failure; + } pos = hsr_get_next_node(hsr, pos, addr); } rcu_read_unlock(); @@ -429,7 +439,7 @@ invalid: return 0; nla_put_failure: - kfree_skb(skb_out); + nlmsg_free(skb_out); /* Fall through */ fail: -- cgit v1.2.3 From 09e91dbea0aa32be02d8877bd50490813de56b9a Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Fri, 13 Mar 2020 06:50:33 +0000 Subject: hsr: set .netnsok flag The hsr module supports the list and status commands 
(HSR_C_GET_NODE_LIST and HSR_C_GET_NODE_STATUS). These commands send node information to userspace via generic netlink. But, in the non-init_net namespace, these commands are not allowed because the .netnsok flag is false. So, there is no way to get node information in a non-init_net namespace. Fixes: f421436a591d ("net/hsr: Add support for the High-availability Seamless Redundancy protocol (HSRv0)") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_netlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 726bfe923999..fae21c863b1f 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -470,6 +470,7 @@ static struct genl_family hsr_genl_family __ro_after_init = { .version = 1, .maxattr = HSR_A_MAX, .policy = hsr_genl_policy, + .netnsok = true, .module = THIS_MODULE, .ops = hsr_ops, .n_ops = ARRAY_SIZE(hsr_ops), -- cgit v1.2.3 From ef299cc3fa1a9e1288665a9fdc8bff55629fd359 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 13 Mar 2020 22:29:54 -0700 Subject: net_sched: cls_route: remove the right filter from hashtable route4_change() allocates a new filter and copies values from the old one. After the new filter is inserted into the hash table, the old filter should be removed and freed, as the final step of the update. However, the current code mistakenly removes the new one. This is clearly wrong, and it causes a double free and a use-after-free, as reported by syzbot. Reported-and-tested-by: syzbot+f9b32aaacd60305d9687@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+2f8c233f131943d6056d@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+9c2df9fd5e9445b74e01@syzkaller.appspotmail.com Fixes: 1109c00547fc ("net: sched: RCU cls_route") Cc: Jamal Hadi Salim Cc: Jiri Pirko Cc: John Fastabend Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_route.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 6f8786b06bde..5efa3e7ace15 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -534,8 +534,8 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, fp = &b->ht[h]; for (pfp = rtnl_dereference(*fp); pfp; fp = &pfp->next, pfp = rtnl_dereference(*fp)) { - if (pfp == f) { - *fp = f->next; + if (pfp == fold) { + rcu_assign_pointer(*fp, fold->next); break; } } -- cgit v1.2.3 From fe2a31d790f81bd14a76de3d3b87f4f1362f60cd Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Sun, 15 Mar 2020 18:17:43 +0100 Subject: netlink: allow extack cookie also for error messages Commit ba0dc5f6e0ba ("netlink: allow sending extended ACK with cookie on success") introduced a cookie which can be sent to userspace as part of the extended ack message in the form of the NLMSGERR_ATTR_COOKIE attribute. Currently the cookie is ignored if the error code is non-zero, but there is no technical reason for such a limitation, and it can be useful to provide machine-parseable information as part of an error message. Include NLMSGERR_ATTR_COOKIE whenever the cookie has been set, regardless of the error code. Signed-off-by: Michal Kubecek Signed-off-by: David S. 
Miller --- net/netlink/af_netlink.c | 43 +++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 5313f1cec170..2f234791b879 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2392,19 +2392,14 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, if (nlk_has_extack && extack && extack->_msg) tlvlen += nla_total_size(strlen(extack->_msg) + 1); - if (err) { - if (!(nlk->flags & NETLINK_F_CAP_ACK)) - payload += nlmsg_len(nlh); - else - flags |= NLM_F_CAPPED; - if (nlk_has_extack && extack && extack->bad_attr) - tlvlen += nla_total_size(sizeof(u32)); - } else { + if (err && !(nlk->flags & NETLINK_F_CAP_ACK)) + payload += nlmsg_len(nlh); + else flags |= NLM_F_CAPPED; - - if (nlk_has_extack && extack && extack->cookie_len) - tlvlen += nla_total_size(extack->cookie_len); - } + if (err && nlk_has_extack && extack && extack->bad_attr) + tlvlen += nla_total_size(sizeof(u32)); + if (nlk_has_extack && extack && extack->cookie_len) + tlvlen += nla_total_size(extack->cookie_len); if (tlvlen) flags |= NLM_F_ACK_TLVS; @@ -2427,20 +2422,16 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg)); } - if (err) { - if (extack->bad_attr && - !WARN_ON((u8 *)extack->bad_attr < in_skb->data || - (u8 *)extack->bad_attr >= in_skb->data + - in_skb->len)) - WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS, - (u8 *)extack->bad_attr - - (u8 *)nlh)); - } else { - if (extack->cookie_len) - WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE, - extack->cookie_len, - extack->cookie)); - } + if (err && extack->bad_attr && + !WARN_ON((u8 *)extack->bad_attr < in_skb->data || + (u8 *)extack->bad_attr >= in_skb->data + + in_skb->len)) + WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS, + (u8 *)extack->bad_attr - + (u8 *)nlh)); + if (extack->cookie_len) + WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE, + extack->cookie_len, extack->cookie)); } nlmsg_end(skb, rep); -- cgit v1.2.3 From 2363d73a2f3e92787f336721c40918ba2eb0c74c Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Sun, 15 Mar 2020 18:17:53 +0100 Subject: ethtool: reject unrecognized request flags As pointed out by Jakub Kicinski, the ethtool netlink code should respond with an error if the request header has flags set which are not recognized by the kernel, either as a mistake or because it expects functionality introduced in later kernel versions. To avoid unnecessary roundtrips, use the extack cookie to provide information about supported request flags. Signed-off-by: Michal Kubecek Signed-off-by: David S. 
Miller --- net/ethtool/netlink.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 180c194fab07..fc9e0b806889 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -40,6 +40,7 @@ int ethnl_parse_header(struct ethnl_req_info *req_info, struct nlattr *tb[ETHTOOL_A_HEADER_MAX + 1]; const struct nlattr *devname_attr; struct net_device *dev = NULL; + u32 flags = 0; int ret; if (!header) { @@ -50,8 +51,17 @@ int ethnl_parse_header(struct ethnl_req_info *req_info, ethnl_header_policy, extack); if (ret < 0) return ret; - devname_attr = tb[ETHTOOL_A_HEADER_DEV_NAME]; + if (tb[ETHTOOL_A_HEADER_FLAGS]) { + flags = nla_get_u32(tb[ETHTOOL_A_HEADER_FLAGS]); + if (flags & ~ETHTOOL_FLAG_ALL) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_HEADER_FLAGS], + "unrecognized request flags"); + nl_set_extack_cookie_u32(extack, ETHTOOL_FLAG_ALL); + return -EOPNOTSUPP; + } + } + devname_attr = tb[ETHTOOL_A_HEADER_DEV_NAME]; if (tb[ETHTOOL_A_HEADER_DEV_INDEX]) { u32 ifindex = nla_get_u32(tb[ETHTOOL_A_HEADER_DEV_INDEX]); @@ -90,9 +100,7 @@ int ethnl_parse_header(struct ethnl_req_info *req_info, } req_info->dev = dev; - if (tb[ETHTOOL_A_HEADER_FLAGS]) - req_info->flags = nla_get_u32(tb[ETHTOOL_A_HEADER_FLAGS]); - + req_info->flags = flags; return 0; } -- cgit v1.2.3 From 32ca98feab8c9076c89c0697c5a85e46fece809d Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Mon, 16 Mar 2020 19:53:00 +0200 Subject: net: ip_gre: Accept IFLA_INFO_DATA-less configuration The fix referenced below causes a crash when an ERSPAN tunnel is created without passing IFLA_INFO_DATA. Fix by validating passed-in data in the same way as ipgre does. Fixes: e1f8f78ffe98 ("net: ip_gre: Separate ERSPAN newlink / changelink callbacks") Reported-by: syzbot+1b4ebf4dae4e510dd219@syzkaller.appspotmail.com Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 7765c65fc7d2..029b24eeafba 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1168,6 +1168,8 @@ static int erspan_netlink_parms(struct net_device *dev, err = ipgre_netlink_parms(dev, data, tb, parms, fwmark); if (err) return err; + if (!data) + return 0; if (data[IFLA_GRE_ERSPAN_VER]) { t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); -- cgit v1.2.3 From 2de9780f75076c1a1f122cbd39df0fa545284724 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 17 Mar 2020 15:54:20 +0100 Subject: net: core: dev.c: fix a documentation warning There's a markup for a link, which is "foo_". In this kernel-doc comment we don't want a link, but rather a literal reference. So, escape the literal with ``foo`` in order to avoid this warning: ./net/core/dev.c:5195: WARNING: Unknown target name: "page_is". Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index c6c985fe7b1b..402a986659cf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5195,7 +5195,7 @@ static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc) * * More direct receive version of netif_receive_skb(). It should * only be used by callers that have a need to skip RPS and Generic XDP. - * Caller must also take care of handling if (page_is_)pfmemalloc. 
+ * Caller must also take care of handling if ``(page_is_)pfmemalloc``. * * This function may only be called from softirq context and interrupts * should be enabled. -- cgit v1.2.3 From dd2af10402684cb5840a127caec9e7cdcff6d167 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Wed, 18 Mar 2020 12:50:33 +0200 Subject: net/sched: act_ct: Fix leak of ct zone template on replace Currently, on replace, the previous action instance's params are swapped with newly allocated params. The old params are only freed (via kfree_rcu), without releasing the ct zone template allocated for them. Call tcf_ct_params_free (via call_rcu) for the old params, so that the ct zone template is released as well. Fixes: b57dc7c13ea9 ("net/sched: Introduce action ct") Signed-off-by: Paul Blakey Signed-off-by: David S. Miller --- net/sched/act_ct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index f685c0d73708..41114b463161 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -739,7 +739,7 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, if (goto_ch) tcf_chain_put_by_act(goto_ch); if (params) - kfree_rcu(params, rcu); + call_rcu(&params->rcu, tcf_ct_params_free); if (res == ACT_P_CREATED) tcf_idr_insert(tn, *a); -- cgit v1.2.3 From 61abaf02d2ec3d4575c66fa1b4a863877736b932 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Tue, 17 Mar 2020 10:02:52 +0800 Subject: netfilter: flowtable: reload ip{v6}h in nf_flow_nat_ip{v6} Since nf_flow_snat_port() and nf_flow_snat_ip{v6}() call pskb_may_pull(), which may change skb->data, we need to reload ip{v6}h at the right place. Fixes: a908fdec3dda ("netfilter: nf_flow_table: move ipv6 offload hook code to nf_flow_table") Fixes: 7d2086871762 ("netfilter: nf_flow_table: move ipv4 offload hook code to nf_flow_table") Signed-off-by: Haishuang Yan Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_ip.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 9e563fd3da0f..22caab7bb755 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -146,11 +146,13 @@ static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb, if (test_bit(NF_FLOW_SNAT, &flow->flags) && (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 || - nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0)) + nf_flow_snat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0)) return -1; + + iph = ip_hdr(skb); if (test_bit(NF_FLOW_DNAT, &flow->flags) && (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 || - nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0)) + nf_flow_dnat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0)) return -1; return 0; @@ -426,11 +428,13 @@ static int nf_flow_nat_ipv6(const struct flow_offload *flow, if (test_bit(NF_FLOW_SNAT, &flow->flags) && (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || - nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) + nf_flow_snat_ipv6(flow, skb, ipv6_hdr(skb), thoff, dir) < 0)) return -1; + + ip6h = ipv6_hdr(skb); if (test_bit(NF_FLOW_DNAT, &flow->flags) && (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || - nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) + nf_flow_dnat_ipv6(flow, skb, ipv6_hdr(skb), thoff, dir) < 0)) return -1; return 0; -- cgit v1.2.3 From 41e9ec5a54f95eee1a57c8d26ab70e0492548c1b Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Tue, 17 Mar 2020 10:02:53 +0800 Subject: 
netfilter: flowtable: reload ip{v6}h in nf_flow_tuple_ip{v6} pskb_may_pull() may change skb->data, so we need to reload ip{v6}h at the right place. Fixes: a908fdec3dda ("netfilter: nf_flow_table: move ipv6 offload hook code to nf_flow_table") Fixes: 7d2086871762 ("netfilter: nf_flow_table: move ipv4 offload hook code to nf_flow_table") Signed-off-by: Haishuang Yan Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_ip.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 22caab7bb755..ba775aecd89a 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -191,6 +191,7 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, if (!pskb_may_pull(skb, thoff + sizeof(*ports))) return -1; + iph = ip_hdr(skb); ports = (struct flow_ports *)(skb_network_header(skb) + thoff); tuple->src_v4.s_addr = iph->saddr; @@ -463,6 +464,7 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, if (!pskb_may_pull(skb, thoff + sizeof(*ports))) return -1; + ip6h = ipv6_hdr(skb); ports = (struct flow_ports *)(skb_network_header(skb) + thoff); tuple->src_v6 = ip6h->saddr; -- cgit v1.2.3 From c921ffe853332584eae4f5905cb2a14a7b3c9932 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Thu, 19 Mar 2020 11:52:25 +0200 Subject: netfilter: flowtable: Fix flushing of offloaded flows on free When freeing a flowtable with offloaded flows, the flows are deleted from hardware but not from the flow table, leaking them and leaving their offload bit on. Add a second pass of the disabled gc to delete these flows from the flow table before freeing it. Fixes: c29f74e0df7a ("netfilter: nf_flow_table: hardware offload support") Signed-off-by: Paul Blakey Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 8af28e10b4e6..70ebebaf5bc1 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -554,6 +554,9 @@ void nf_flow_table_free(struct nf_flowtable *flow_table) nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table); nf_flow_table_offload_flush(flow_table); + if (nf_flowtable_hw_offload(flow_table)) + nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, + flow_table); rhashtable_destroy(&flow_table->rhashtable); } EXPORT_SYMBOL_GPL(nf_flow_table_free); -- cgit v1.2.3 From 15ff197237e76c4dab06b7b518afaa4ebb1c43e0 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Thu, 19 Mar 2020 19:37:21 +0000 Subject: netfilter: flowtable: populate addr_type mask nf_flow_rule_match() sets control.addr_type in the key, so it needs to also set the corresponding mask. An exact match is wanted, so the mask is all ones.
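For illustration only (this sketch is not part of the patch and the function name is hypothetical): offload matches are key/mask pairs, and a zero mask means "don't care", so the addr_type key set by nf_flow_rule_match() was never actually matched on by drivers before this fix.

  #include <net/netfilter/nf_flow_table.h>

  /* Hypothetical sketch: an exact match needs both key and mask set. */
  static void example_addr_type_match(struct nf_flow_match *match)
  {
          /* the key alone is ignored by drivers if the mask is all zeroes */
          match->key.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
          /* an all-ones mask requests an exact match on addr_type */
          match->mask.control.addr_type = 0xffff;
  }
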
Fixes: c29f74e0df7a ("netfilter: nf_flow_table: hardware offload support") Signed-off-by: Edward Cree Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_offload.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 06f00cdc3891..f2c22c682851 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -87,6 +87,7 @@ static int nf_flow_rule_match(struct nf_flow_match *match, default: return -EOPNOTSUPP; } + mask->control.addr_type = 0xffff; match->dissector.used_keys |= BIT(key->control.addr_type); mask->basic.n_proto = 0xffff; -- cgit v1.2.3 From b738a185beaab8728943acdb3e67371b8a88185e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 19 Mar 2020 12:49:55 -0700 Subject: tcp: ensure skb->dev is NULL before leaving TCP stack skb->rbnode shares three skb fields: next, prev, dev. When a packet is sent, TCP keeps the original skb (master) in an rtx queue, which was converted to an rbtree a while back. __tcp_transmit_skb() is responsible for cloning the master skb, and adds the TCP header to the clone before sending it to the network layer. skb_clone() already clears skb->next and skb->prev, but copies the master oskb->dev into the clone. We need to clear skb->dev, otherwise lower layers could interpret the value as a pointer to a netdev. This old bug surfaced recently when commit 28f8bfd1ac94 ("netfilter: Support iif matches in POSTROUTING") was merged. Before this netfilter commit, the skb->dev value was ignored and changed before reaching dev_queue_xmit(). Fixes: 75c119afe14f ("tcp: implement rb-tree based retransmit queue") Fixes: 28f8bfd1ac94 ("netfilter: Support iif matches in POSTROUTING") Signed-off-by: Eric Dumazet Reported-by: Martin Zaharinov Cc: Florian Westphal Cc: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 306e25d743e8..e8cf8fde3d37 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1109,6 +1109,10 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, if (unlikely(!skb)) return -ENOBUFS; + /* retransmit skbs might have a non zero value in skb->dev + * because skb->dev is aliased with skb->rbnode.rb_left + */ + skb->dev = NULL; } inet = inet_sk(sk); -- cgit v1.2.3 From 07f8e4d0fddbf2f87e4cefb551278abc38db8cdd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 20 Mar 2020 16:52:02 +0100 Subject: tcp: also NULL skb->dev when copy was needed In rare cases the retransmit logic will make a full skb copy, which will not trigger the zeroing added in the recent change b738a185beaa ("tcp: ensure skb->dev is NULL before leaving TCP stack"). Cc: Eric Dumazet Fixes: 75c119afe14f ("tcp: implement rb-tree based retransmit queue") Fixes: 28f8bfd1ac94 ("netfilter: Support iif matches in POSTROUTING") Signed-off-by: Florian Westphal Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e8cf8fde3d37..2f45cde168c4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3041,8 +3041,12 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) tcp_skb_tsorted_save(skb) { nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); - err = nskb ?
tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : - -ENOBUFS; + if (nskb) { + nskb->dev = NULL; + err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC); + } else { + err = -ENOBUFS; + } } tcp_skb_tsorted_restore(skb); if (!err) { -- cgit v1.2.3 From 3a303cfdd28d5f930a307c82e8a9d996394d5ebd Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sat, 21 Mar 2020 06:46:50 +0000 Subject: hsr: fix general protection fault in hsr_addr_is_self() port->hsr is used in hsr_handle_frame(), which is an rx_handler callback. The hsr master and slaves are initialized in hsr_add_port(). This function initializes several pointers, including port->hsr, only after registering the rx_handler. So the rx_handler routine could run with an uninitialized pointer. In order to fix this, the pointers should be initialized before registering the rx_handler. Test commands: ip netns del left ip netns del right modprobe -rv veth modprobe -rv hsr killall ping modprobe hsr ip netns add left ip netns add right ip link add veth0 type veth peer name veth1 ip link add veth2 type veth peer name veth3 ip link add veth4 type veth peer name veth5 ip link set veth1 netns left ip link set veth3 netns right ip link set veth4 netns left ip link set veth5 netns right ip link set veth0 up ip link set veth2 up ip link set veth0 address fc:00:00:00:00:01 ip link set veth2 address fc:00:00:00:00:02 ip netns exec left ip link set veth1 up ip netns exec left ip link set veth4 up ip netns exec right ip link set veth3 up ip netns exec right ip link set veth5 up ip link add hsr0 type hsr slave1 veth0 slave2 veth2 ip a a 192.168.100.1/24 dev hsr0 ip link set hsr0 up ip netns exec left ip link add hsr1 type hsr slave1 veth1 slave2 veth4 ip netns exec left ip a a 192.168.100.2/24 dev hsr1 ip netns exec left ip link set hsr1 up ip netns exec left ip n a 192.168.100.1 dev hsr1 lladdr \ fc:00:00:00:00:01 nud permanent ip netns exec left ip n r 192.168.100.1 dev hsr1 lladdr \ fc:00:00:00:00:01 nud permanent for i in {1..100} do ip netns exec left ping 192.168.100.1 & done ip netns exec left hping3 192.168.100.1 -2 --flood & ip netns exec right ip link add hsr2 type hsr slave1 veth3 slave2 veth5 ip netns exec right ip a a 192.168.100.3/24 dev hsr2 ip netns exec right ip link set hsr2 up ip netns exec right ip n a 192.168.100.1 dev hsr2 lladdr \ fc:00:00:00:00:02 nud permanent ip netns exec right ip n r 192.168.100.1 dev hsr2 lladdr \ fc:00:00:00:00:02 nud permanent for i in {1..100} do ip netns exec right ping 192.168.100.1 & done ip netns exec right hping3 192.168.100.1 -2 --flood & while : do ip link add hsr0 type hsr slave1 veth0 slave2 veth2 ip a a 192.168.100.1/24 dev hsr0 ip link set hsr0 up ip link del hsr0 done Splat looks like: [ 120.954938][ C0] general protection fault, probably for non-canonical address 0xdffffc0000000006: 0000 [#1]I [ 120.957761][ C0] KASAN: null-ptr-deref in range [0x0000000000000030-0x0000000000000037] [ 120.959064][ C0] CPU: 0 PID: 1511 Comm: hping3 Not tainted 5.6.0-rc5+ #460 [ 120.960054][ C0] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 120.962261][ C0] RIP: 0010:hsr_addr_is_self+0x65/0x2a0 [hsr] [ 120.963149][ C0] Code: 44 24 18 70 73 2f c0 48 c1 eb 03 48 8d 04 13 c7 00 f1 f1 f1 f1 c7 40 04 00 f2 f2 f2 4 [ 120.966277][ C0] RSP: 0018:ffff8880d9c09af0 EFLAGS: 00010206 [ 120.967293][ C0] RAX: 0000000000000006 RBX: 1ffff1101b38135f RCX: 0000000000000000 [ 120.968516][ C0] RDX: dffffc0000000000 RSI: ffff8880d17cb208 RDI: 0000000000000000 [ 120.969718][ C0] RBP: 0000000000000030 R08: ffffed101b3c0e3c R09:
0000000000000001 [ 120.972203][ C0] R10: 0000000000000001 R11: ffffed101b3c0e3b R12: 0000000000000000 [ 120.973379][ C0] R13: ffff8880aaf80100 R14: ffff8880aaf800f2 R15: ffff8880aaf80040 [ 120.974410][ C0] FS: 00007f58e693f740(0000) GS:ffff8880d9c00000(0000) knlGS:0000000000000000 [ 120.979794][ C0] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 120.980773][ C0] CR2: 00007ffcb8b38f29 CR3: 00000000afe8e001 CR4: 00000000000606f0 [ 120.981945][ C0] Call Trace: [ 120.982411][ C0] [ 120.982848][ C0] ? hsr_add_node+0x8c0/0x8c0 [hsr] [ 120.983522][ C0] ? rcu_read_lock_held+0x90/0xa0 [ 120.984159][ C0] ? rcu_read_lock_sched_held+0xc0/0xc0 [ 120.984944][ C0] hsr_handle_frame+0x1db/0x4e0 [hsr] [ 120.985597][ C0] ? hsr_nl_nodedown+0x2b0/0x2b0 [hsr] [ 120.986289][ C0] __netif_receive_skb_core+0x6bf/0x3170 [ 120.992513][ C0] ? check_chain_key+0x236/0x5d0 [ 120.993223][ C0] ? do_xdp_generic+0x1460/0x1460 [ 120.993875][ C0] ? register_lock_class+0x14d0/0x14d0 [ 120.994609][ C0] ? __netif_receive_skb_one_core+0x8d/0x160 [ 120.995377][ C0] __netif_receive_skb_one_core+0x8d/0x160 [ 120.996204][ C0] ? __netif_receive_skb_core+0x3170/0x3170 [ ... ] Reported-by: syzbot+fcf5dd39282ceb27108d@syzkaller.appspotmail.com Fixes: c5a759117210 ("net/hsr: Use list_head (and rcu) instead of array for slave devices.") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_slave.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index fbfd0db182b7..a9104d42aafb 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -145,16 +145,16 @@ int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, if (!port) return -ENOMEM; + port->hsr = hsr; + port->dev = dev; + port->type = type; + if (type != HSR_PT_MASTER) { res = hsr_portdev_setup(dev, port); if (res) goto fail_dev_setup; } - port->hsr = hsr; - port->dev = dev; - port->type = type; - list_add_tail_rcu(&port->port_list, &hsr->ports); synchronize_rcu(); -- cgit v1.2.3 From 6cd6cbf593bfa3ae6fc3ed34ac21da4d35045425 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Mar 2020 19:21:02 -0700 Subject: tcp: repair: fix TCP_QUEUE_SEQ implementation When an application uses the TCP_QUEUE_SEQ socket option to change tp->rcv_nxt, we must also update tp->copied_seq. Otherwise, stuff relying on tcp_inq() being precise can eventually be confused. For example, tcp_zerocopy_receive() might crash because it does not expect tcp_recv_skb() to return NULL. We could add tests in various places to fix the issue, or simply make sure tcp_inq() won't return a random value, and leave the fast path as it is. Note that this fixes ioctl(fd, SIOCINQ, &val) at the same time. Fixes: ee9952831cfd ("tcp: Initial repair mode") Fixes: 05255b823a61 ("tcp: add TCP_ZEROCOPY_RECEIVE support for zerocopy receive") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S.
Miller --- net/ipv4/tcp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index eb2d80519f8e..dc77c303e6f7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2948,8 +2948,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level, err = -EPERM; else if (tp->repair_queue == TCP_SEND_QUEUE) WRITE_ONCE(tp->write_seq, val); - else if (tp->repair_queue == TCP_RECV_QUEUE) + else if (tp->repair_queue == TCP_RECV_QUEUE) { WRITE_ONCE(tp->rcv_nxt, val); + WRITE_ONCE(tp->copied_seq, val); + } else err = -EINVAL; break; -- cgit v1.2.3 From dddeb30bfc43926620f954266fd12c65a7206f07 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 19 Mar 2020 22:54:21 -0400 Subject: ipv4: fix a RCU-list lock in inet_dump_fib() There is a call chain, inet_dump_fib() -> fib_table_dump() -> fn_trie_dump_leaf() -> hlist_for_each_entry_rcu(), that runs without rcu_read_lock() and will trigger a warning: WARNING: suspicious RCU usage ----------------------------- net/ipv4/fib_trie.c:2216 RCU-list traversed in non-reader section!! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by ip/1923: #0: ffffffff8ce76e40 (rtnl_mutex){+.+.}, at: netlink_dump+0xd6/0x840 Call Trace: dump_stack+0xa1/0xea lockdep_rcu_suspicious+0x103/0x10d fn_trie_dump_leaf+0x581/0x590 fib_table_dump+0x15f/0x220 inet_dump_fib+0x4ad/0x5d0 netlink_dump+0x350/0x840 __netlink_dump_start+0x315/0x3e0 rtnetlink_rcv_msg+0x4d1/0x720 netlink_rcv_skb+0xf0/0x220 rtnetlink_rcv+0x15/0x20 netlink_unicast+0x306/0x460 netlink_sendmsg+0x44b/0x770 __sys_sendto+0x259/0x270 __x64_sys_sendto+0x80/0xa0 do_syscall_64+0x69/0xf4 entry_SYSCALL_64_after_hwframe+0x49/0xb3 Fixes: 18a8021a7be3 ("net/ipv4: Plumb support for filtering route dumps") Signed-off-by: Qian Cai Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 577db1d50a24..213be9c050ad 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -997,7 +997,9 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) return -ENOENT; } + rcu_read_lock(); err = fib_table_dump(tb, skb, cb, &filter); + rcu_read_unlock(); return skb->len ? : err; } -- cgit v1.2.3 From 0e62f543bed03a64495bd2651d4fe1aa4bcb7fe5 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Sun, 22 Mar 2020 13:58:50 -0700 Subject: net: dsa: Fix duplicate frames flooded by learning When both the switch and the bridge are learning about new addresses, switch ports attached to the bridge would see duplicate ARP frames because both entities would attempt to send them. Fixes: 5037d532b83d ("net: dsa: add Broadcom tag RX/TX handler") Reported-by: Maxime Bizon Signed-off-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S.
Miller --- net/dsa/tag_brcm.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 9c3114179690..9169b63a89e3 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -140,6 +140,8 @@ static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb, /* Remove Broadcom tag and update checksum */ skb_pull_rcsum(skb, BRCM_TAG_LEN); + skb->offload_fwd_mark = 1; + return skb; } #endif -- cgit v1.2.3 From 2f599ec422ad6634fb5ad43748b9969ca9d742bd Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Sun, 22 Mar 2020 22:24:21 +0100 Subject: ethtool: fix reference leak in some *_SET handlers Andrew noticed that some handlers for *_SET commands leak a netdev reference if required ethtool_ops callbacks do not exist. A simple reproducer would be e.g. ip link add veth1 type veth peer name veth2 ethtool -s veth1 wol g ip link del veth1 Make sure dev_put() is called when ethtool_ops check fails. v2: add Fixes tags Fixes: a53f3d41e4d3 ("ethtool: set link settings with LINKINFO_SET request") Fixes: bfbcfe2032e7 ("ethtool: set link modes related data with LINKMODES_SET request") Fixes: e54d04e3afea ("ethtool: set message mask with DEBUG_SET request") Fixes: 8d425b19b305 ("ethtool: set wake-on-lan settings with WOL_SET request") Reported-by: Andrew Lunn Signed-off-by: Michal Kubecek Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/ethtool/debug.c | 4 +++- net/ethtool/linkinfo.c | 4 +++- net/ethtool/linkmodes.c | 4 +++- net/ethtool/wol.c | 4 +++- 4 files changed, 12 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ethtool/debug.c b/net/ethtool/debug.c index aaef4843e6ba..92599ad7b3c2 100644 --- a/net/ethtool/debug.c +++ b/net/ethtool/debug.c @@ -107,8 +107,9 @@ int ethnl_set_debug(struct sk_buff *skb, struct genl_info *info) if (ret < 0) return ret; dev = req_info.dev; + ret = -EOPNOTSUPP; if (!dev->ethtool_ops->get_msglevel || !dev->ethtool_ops->set_msglevel) - return -EOPNOTSUPP; + goto out_dev; rtnl_lock(); ret = ethnl_ops_begin(dev); @@ -129,6 +130,7 @@ out_ops: ethnl_ops_complete(dev); out_rtnl: rtnl_unlock(); +out_dev: dev_put(dev); return ret; } diff --git a/net/ethtool/linkinfo.c b/net/ethtool/linkinfo.c index 5d16cb4e8693..6e9e0b590bb5 100644 --- a/net/ethtool/linkinfo.c +++ b/net/ethtool/linkinfo.c @@ -126,9 +126,10 @@ int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info) if (ret < 0) return ret; dev = req_info.dev; + ret = -EOPNOTSUPP; if (!dev->ethtool_ops->get_link_ksettings || !dev->ethtool_ops->set_link_ksettings) - return -EOPNOTSUPP; + goto out_dev; rtnl_lock(); ret = ethnl_ops_begin(dev); @@ -162,6 +163,7 @@ out_ops: ethnl_ops_complete(dev); out_rtnl: rtnl_unlock(); +out_dev: dev_put(dev); return ret; } diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index 96f20be64553..18cc37be2d9c 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -338,9 +338,10 @@ int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info) if (ret < 0) return ret; dev = req_info.dev; + ret = -EOPNOTSUPP; if (!dev->ethtool_ops->get_link_ksettings || !dev->ethtool_ops->set_link_ksettings) - return -EOPNOTSUPP; + goto out_dev; rtnl_lock(); ret = ethnl_ops_begin(dev); @@ -370,6 +371,7 @@ out_ops: ethnl_ops_complete(dev); out_rtnl: rtnl_unlock(); +out_dev: dev_put(dev); return ret; } diff --git a/net/ethtool/wol.c b/net/ethtool/wol.c index e1b8a65b64c4..55e1ecaaf739 100644 --- a/net/ethtool/wol.c +++ b/net/ethtool/wol.c @@ 
-128,8 +128,9 @@ int ethnl_set_wol(struct sk_buff *skb, struct genl_info *info) if (ret < 0) return ret; dev = req_info.dev; + ret = -EOPNOTSUPP; if (!dev->ethtool_ops->get_wol || !dev->ethtool_ops->set_wol) - return -EOPNOTSUPP; + goto out_dev; rtnl_lock(); ret = ethnl_ops_begin(dev); @@ -172,6 +173,7 @@ out_ops: ethnl_ops_complete(dev); out_rtnl: rtnl_unlock(); +out_dev: dev_put(dev); return ret; } -- cgit v1.2.3 From 8c2d45b2b65ca1f215244be1c600236e83f9815f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 22 Mar 2020 03:21:58 +0100 Subject: netfilter: nf_tables: Allow set back-ends to report partial overlaps on insertion Currently, the -EEXIST return code of ->insert() callbacks is ambiguous: it might indicate that a given element (including intervals) already exists as such, or that the new element would clash with existing ones. If identical elements already exist, the front-end ignores this without returning an error, as long as NLM_F_EXCL is not set. However, if the new element can't be inserted due to an overlap, we should report this to the user. To this end, allow set back-ends to return -ENOTEMPTY on collision with existing elements, translate that to -EEXIST, and return that to userspace, no matter if NLM_F_EXCL was set. Reported-by: Phil Sutter Signed-off-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 38c680f28f15..d11f1a74d43c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5082,6 +5082,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = -EBUSY; else if (!(nlmsg_flags & NLM_F_EXCL)) err = 0; + } else if (err == -ENOTEMPTY) { + /* ENOTEMPTY reports overlapping between this element + * and an existing one. + */ + err = -EEXIST; } goto err_element_clash; } -- cgit v1.2.3 From 0eb4b5ee33f2461d149408a247af8ae24756a6ca Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Sun, 22 Mar 2020 03:21:59 +0100 Subject: netfilter: nft_set_pipapo: Separate partial and complete overlap cases on insertion ...and return -ENOTEMPTY to the front-end on collision, -EEXIST if an identical element already exists. Together with the previous patch, element collision will now be returned to the user as -EEXIST.
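A minimal sketch of the resulting return-code contract for set back-ends (the function and parameter names here are hypothetical, for illustration only):

  #include <linux/errno.h>

  /* Hypothetical sketch of what an ->insert() callback now reports. */
  static int example_insert_rc(bool identical, bool clashes)
  {
          if (identical)
                  return -EEXIST;         /* same element already exists */
          if (clashes)
                  return -ENOTEMPTY;      /* overlaps a different element;
                                           * the front-end turns this into
                                           * -EEXIST for userspace
                                           */
          return 0;
  }
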
Reported-by: Phil Sutter Signed-off-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_pipapo.c | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 4fc0c924ed5d..ef7e8ad2e344 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1098,21 +1098,41 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, struct nft_pipapo_field *f; int i, bsize_max, err = 0; + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) + end = (const u8 *)nft_set_ext_key_end(ext)->data; + else + end = start; + dup = pipapo_get(net, set, start, genmask); - if (PTR_ERR(dup) == -ENOENT) { - if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) { - end = (const u8 *)nft_set_ext_key_end(ext)->data; - dup = pipapo_get(net, set, end, nft_genmask_next(net)); - } else { - end = start; + if (!IS_ERR(dup)) { + /* Check if we already have the same exact entry */ + const struct nft_data *dup_key, *dup_end; + + dup_key = nft_set_ext_key(&dup->ext); + if (nft_set_ext_exists(&dup->ext, NFT_SET_EXT_KEY_END)) + dup_end = nft_set_ext_key_end(&dup->ext); + else + dup_end = dup_key; + + if (!memcmp(start, dup_key->data, sizeof(*dup_key->data)) && + !memcmp(end, dup_end->data, sizeof(*dup_end->data))) { + *ext2 = &dup->ext; + return -EEXIST; } + + return -ENOTEMPTY; + } + + if (PTR_ERR(dup) == -ENOENT) { + /* Look for partially overlapping entries */ + dup = pipapo_get(net, set, end, nft_genmask_next(net)); } if (PTR_ERR(dup) != -ENOENT) { if (IS_ERR(dup)) return PTR_ERR(dup); *ext2 = &dup->ext; - return -EEXIST; + return -ENOTEMPTY; } /* Validate */ -- cgit v1.2.3 From 6f7c9caf017be8ab0fe3b99509580d0793bf0833 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Sun, 22 Mar 2020 03:22:00 +0100 Subject: netfilter: nft_set_rbtree: Introduce and use nft_rbtree_interval_start() Replace negations of nft_rbtree_interval_end() with a new helper, nft_rbtree_interval_start(), wherever this helps to visualise the problem at hand, that is, for all the occurrences except for the comparison against given flags in __nft_rbtree_get(). This gets especially useful in the next patch. 
Signed-off-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_rbtree.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 5000b938ab1e..85572b2a6051 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -33,6 +33,11 @@ static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe) (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END); } +static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe) +{ + return !nft_rbtree_interval_end(rbe); +} + static bool nft_rbtree_equal(const struct nft_set *set, const void *this, const struct nft_rbtree_elem *interval) { @@ -64,7 +69,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set if (interval && nft_rbtree_equal(set, this, interval) && nft_rbtree_interval_end(rbe) && - !nft_rbtree_interval_end(interval)) + nft_rbtree_interval_start(interval)) continue; interval = rbe; } else if (d > 0) @@ -89,7 +94,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set if (set->flags & NFT_SET_INTERVAL && interval != NULL && nft_set_elem_active(&interval->ext, genmask) && - !nft_rbtree_interval_end(interval)) { + nft_rbtree_interval_start(interval)) { *ext = &interval->ext; return true; } @@ -224,9 +229,9 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, p = &parent->rb_right; else { if (nft_rbtree_interval_end(rbe) && - !nft_rbtree_interval_end(new)) { + nft_rbtree_interval_start(new)) { p = &parent->rb_left; - } else if (!nft_rbtree_interval_end(rbe) && + } else if (nft_rbtree_interval_start(rbe) && nft_rbtree_interval_end(new)) { p = &parent->rb_right; } else if (nft_set_elem_active(&rbe->ext, genmask)) { @@ -317,10 +322,10 @@ static void *nft_rbtree_deactivate(const struct net *net, parent = parent->rb_right; else { if (nft_rbtree_interval_end(rbe) && - !nft_rbtree_interval_end(this)) { + nft_rbtree_interval_start(this)) { parent = parent->rb_left; continue; - } else if (!nft_rbtree_interval_end(rbe) && + } else if (nft_rbtree_interval_start(rbe) && nft_rbtree_interval_end(this)) { parent = parent->rb_right; continue; -- cgit v1.2.3 From 7c84d41416d836ef7e533bd4d64ccbdf40c5ac70 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Sun, 22 Mar 2020 03:22:01 +0100 Subject: netfilter: nft_set_rbtree: Detect partial overlaps on insertion ...and return -ENOTEMPTY to the front-end in this case, instead of proceeding. Currently, nft takes care of checking for these cases and not sending them to the kernel, but if we drop the set_overlap() call in nft we can end up in situations like: # nft add table t # nft add set t s '{ type inet_service ; flags interval ; }' # nft add element t s '{ 1 - 5 }' # nft add element t s '{ 6 - 10 }' # nft add element t s '{ 4 - 7 }' # nft list set t s table ip t { set s { type inet_service flags interval elements = { 1-3, 4-5, 6-7 } } } The primary purpose of this change is to make the behaviour consistent with nft_set_pipapo, but it also serves to avoid inconsistent behaviour if userspace sends overlapping elements for any reason.
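For illustration, with this patch the overlapping insertion from the session above is expected to fail instead of being silently spliced in (hypothetical session, error text approximate; the back-end returns -ENOTEMPTY, which the front-end reports to userspace as -EEXIST, i.e. "File exists"):

  # nft add element t s '{ 1 - 5 }'
  # nft add element t s '{ 6 - 10 }'
  # nft add element t s '{ 4 - 7 }'
  Error: Could not process rule: File exists
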
v2: When we meet the same key data in the tree, as start element while inserting an end element, or as end element while inserting a start element, actually check that the existing element is active, before resetting the overlap flag (Pablo Neira Ayuso) Signed-off-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_rbtree.c | 70 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 85572b2a6051..8617fc16a1ed 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -213,8 +213,43 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, u8 genmask = nft_genmask_next(net); struct nft_rbtree_elem *rbe; struct rb_node *parent, **p; + bool overlap = false; int d; + /* Detect overlaps as we descend the tree. Set the flag in these cases: + * + * a1. |__ _ _? >|__ _ _ (insert start after existing start) + * a2. _ _ __>| ?_ _ __| (insert end before existing end) + * a3. _ _ ___| ?_ _ _>| (insert end after existing end) + * a4. >|__ _ _ _ _ __| (insert start before existing end) + * + * and clear it later on, as we eventually reach the points indicated by + * '?' above, in the cases described below. We'll always meet these + * later, locally, due to tree ordering, and overlaps for the intervals + * that are the closest together are always evaluated last. + * + * b1. |__ _ _! >|__ _ _ (insert start after existing end) + * b2. _ _ __>| !_ _ __| (insert end before existing start) + * b3. !_____>| (insert end after existing start) + * + * Case a4. resolves to b1.: + * - if the inserted start element is the leftmost, because the '0' + * element in the tree serves as end element + * - otherwise, if an existing end is found. Note that end elements are + * always inserted after corresponding start elements. + * + * For a new, rightmost pair of elements, we'll hit cases b1. and b3., + * in that order. + * + * The flag is also cleared in two special cases: + * + * b4. |__ _ _!|<_ _ _ (insert start right before existing end) + * b5. |__ _ >|!__ _ _ (insert end right after existing start) + * + * which always happen as last step and imply that no further + * overlapping is possible. 
+ */ + parent = NULL; p = &priv->root.rb_node; while (*p != NULL) { @@ -223,17 +258,42 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, d = memcmp(nft_set_ext_key(&rbe->ext), nft_set_ext_key(&new->ext), set->klen); - if (d < 0) + if (d < 0) { p = &parent->rb_left; - else if (d > 0) + + if (nft_rbtree_interval_start(new)) { + overlap = nft_rbtree_interval_start(rbe) && + nft_set_elem_active(&rbe->ext, + genmask); + } else { + overlap = nft_rbtree_interval_end(rbe) && + nft_set_elem_active(&rbe->ext, + genmask); + } + } else if (d > 0) { p = &parent->rb_right; - else { + + if (nft_rbtree_interval_end(new)) { + overlap = nft_rbtree_interval_end(rbe) && + nft_set_elem_active(&rbe->ext, + genmask); + } else if (nft_rbtree_interval_end(rbe) && + nft_set_elem_active(&rbe->ext, genmask)) { + overlap = true; + } + } else { if (nft_rbtree_interval_end(rbe) && nft_rbtree_interval_start(new)) { p = &parent->rb_left; + + if (nft_set_elem_active(&rbe->ext, genmask)) + overlap = false; } else if (nft_rbtree_interval_start(rbe) && nft_rbtree_interval_end(new)) { p = &parent->rb_right; + + if (nft_set_elem_active(&rbe->ext, genmask)) + overlap = false; } else if (nft_set_elem_active(&rbe->ext, genmask)) { *ext = &rbe->ext; return -EEXIST; @@ -242,6 +302,10 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, } } } + + if (overlap) + return -ENOTEMPTY; + rb_link_node_rcu(&new->node, parent, p); rb_insert_color(&new->node, &priv->root); return 0; -- cgit v1.2.3 From 76a109fac206e158eb3c967af98c178cff738e6a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 23 Mar 2020 14:27:16 +0100 Subject: netfilter: nft_fwd_netdev: validate family and chain type Make sure the forward action is only used from ingress. Fixes: 39e6dea28adc ("netfilter: nf_tables: add forward expression to the netdev family") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_fwd_netdev.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index aba11c2333f3..ddd28de810b6 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -190,6 +190,13 @@ nla_put_failure: return -1; } +static int nft_fwd_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS)); +} + static struct nft_expr_type nft_fwd_netdev_type; static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = { .type = &nft_fwd_netdev_type, @@ -197,6 +204,7 @@ static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = { .eval = nft_fwd_neigh_eval, .init = nft_fwd_neigh_init, .dump = nft_fwd_neigh_dump, + .validate = nft_fwd_validate, }; static const struct nft_expr_ops nft_fwd_netdev_ops = { @@ -205,6 +213,7 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = { .eval = nft_fwd_netdev_eval, .init = nft_fwd_netdev_init, .dump = nft_fwd_netdev_dump, + .validate = nft_fwd_validate, .offload = nft_fwd_netdev_offload, }; -- cgit v1.2.3 From bcfabee1afd99484b6ba067361b8678e28bbc065 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 23 Mar 2020 19:53:10 +0100 Subject: netfilter: nft_fwd_netdev: allow to redirect to ifb via ingress Set skb->tc_redirected to 1, otherwise the ifb driver drops the packet. Set skb->tc_from_ingress to 1 to reinject the packet back to the ingress path after leaving the ifb egress path. 
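A hypothetical setup exercising this path, for illustration only (device names are placeholders):

  ip link add ifb0 type ifb
  ip link set ifb0 up
  nft add table netdev t
  nft add chain netdev t c '{ type filter hook ingress device eth0 priority 0; }'
  nft add rule netdev t c fwd to ifb0
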
This patch unconditionally sets these two skb fields, which are meaningful to the ifb driver. The existing forward action is guaranteed to run from the ingress path. Fixes: 39e6dea28adc ("netfilter: nf_tables: add forward expression to the netdev family") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_fwd_netdev.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index ddd28de810b6..74f050ba6bad 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -28,6 +28,10 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr, struct nft_fwd_netdev *priv = nft_expr_priv(expr); int oif = regs->data[priv->sreg_dev]; + /* These are used by ifb only. */ + pkt->skb->tc_redirected = 1; + pkt->skb->tc_from_ingress = 1; + nf_fwd_netdev_egress(pkt, oif); regs->verdict.code = NF_STOLEN; } -- cgit v1.2.3 From 961d0e5b32946703125964f9f5b6321d60f4d706 Mon Sep 17 00:00:00 2001 From: Zh-yuan Ye Date: Tue, 24 Mar 2020 17:28:25 +0900 Subject: net: cbs: Fix software cbs to consider packet sending time Currently the software CBS does not consider the packet sending time when depleting the credits. It caused the throughput to be Idleslope[kbps] * (Port transmit rate[kbps] / |Sendslope[kbps]|), where Idleslope * (Port transmit rate / (Idleslope + |Sendslope|)) = Idleslope is expected, since Idleslope + |Sendslope| equals the port transmit rate. In order to fix the issue above, this patch takes the time when the packet sending completes into account, by moving the anchor time variable "last" ahead to the send completion time upon transmission, and adding a wait when the next dequeue request comes before the send completion time of the previous packet. changelog: V2->V3: - remove unnecessary whitespace cleanup - add the checks if port_rate is 0 before division V1->V2: - combine variable "send_completed" into "last" - add the comment for estimate of the packet sending Fixes: 585d763af09c ("net/sched: Introduce Credit Based Shaper (CBS) qdisc") Signed-off-by: Zh-yuan Ye Reviewed-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- net/sched/sch_cbs.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index b2905b03a432..2eaac2ff380f 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -181,6 +181,11 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) s64 credits; int len; + /* The previous packet is still being sent */ + if (now < q->last) { + qdisc_watchdog_schedule_ns(&q->watchdog, q->last); + return NULL; + } if (q->credits < 0) { credits = timediff_to_credits(now - q->last, q->idleslope); @@ -212,7 +217,12 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) credits += q->credits; q->credits = max_t(s64, credits, q->locredit); - q->last = now; + /* Estimate of the transmission of the last byte of the packet in ns */ + if (unlikely(atomic64_read(&q->port_rate) == 0)) + q->last = now; + else + q->last = now + div64_s64(len * NSEC_PER_SEC, + atomic64_read(&q->port_rate)); return skb; } -- cgit v1.2.3 From e80f40cbe4dd51371818e967d40da8fe305db5e4 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 24 Mar 2020 11:45:34 +0200 Subject: net: dsa: tag_8021q: replace dsa_8021q_remove_header with __skb_vlan_pop Not only did this wheel not need reinventing, but there is also an issue with it: it doesn't remove the VLAN header in a way that preserves the L2 payload checksum when that is provided by the DSA master hw.
It should recalculate the checksum both for the push, before removing the header, and for the pull afterwards. But the current implementation is quite dizzying, with pulls immediately followed by pushes, the memmove done before the push, etc. This makes a DSA master with RX checksumming offload print stack traces with the infamous 'hw csum failure' message. So remove the dsa_8021q_remove_header function and replace it with something that actually works with inet checksumming. Fixes: d461933638ae ("net: dsa: tag_8021q: Create helper function for removing VLAN header") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/8021q.h | 7 ------- net/dsa/tag_8021q.c | 43 ------------------------------------------- net/dsa/tag_sja1105.c | 19 +++++++++---------- 3 files changed, 9 insertions(+), 60 deletions(-) (limited to 'net') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 0aa803c451a3..c620d9139c28 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -28,8 +28,6 @@ int dsa_8021q_rx_switch_id(u16 vid); int dsa_8021q_rx_source_port(u16 vid); -struct sk_buff *dsa_8021q_remove_header(struct sk_buff *skb); - #else int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index, @@ -64,11 +62,6 @@ int dsa_8021q_rx_source_port(u16 vid) return 0; } -struct sk_buff *dsa_8021q_remove_header(struct sk_buff *skb) -{ - return NULL; -} - #endif /* IS_ENABLED(CONFIG_NET_DSA_TAG_8021Q) */ #endif /* _NET_DSA_8021Q_H */ diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 2fb6c26294b5..b97ad93d1c1a 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -298,47 +298,4 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, } EXPORT_SYMBOL_GPL(dsa_8021q_xmit); -/* In the DSA packet_type handler, skb->data points in the middle of the VLAN - * tag, after tpid and before tci. This is because so far, ETH_HLEN - * (DMAC, SMAC, EtherType) bytes were pulled. - * There are 2 bytes of VLAN tag left in skb->data, and upper - * layers expect the 'real' EtherType to be consumed as well. - * Coincidentally, a VLAN header is also of the same size as - * the number of bytes that need to be pulled.
- * - * skb_mac_header skb->data - * | | - * v v - * | | | | | | | | | | | | | | | | | | | - * +-----------------------+-----------------------+-------+-------+-------+ - * | Destination MAC | Source MAC | TPID | TCI | EType | - * +-----------------------+-----------------------+-------+-------+-------+ - * ^ | | - * |<--VLAN_HLEN-->to <---VLAN_HLEN---> - * from | - * >>>>>>> v - * >>>>>>> | | | | | | | | | | | | | | | - * >>>>>>> +-----------------------+-----------------------+-------+ - * >>>>>>> | Destination MAC | Source MAC | EType | - * +-----------------------+-----------------------+-------+ - * ^ ^ - * (now part of | | - * skb->head) skb_mac_header skb->data - */ -struct sk_buff *dsa_8021q_remove_header(struct sk_buff *skb) -{ - u8 *from = skb_mac_header(skb); - u8 *dest = from + VLAN_HLEN; - - memmove(dest, from, ETH_HLEN - VLAN_HLEN); - skb_pull(skb, VLAN_HLEN); - skb_push(skb, ETH_HLEN); - skb_reset_mac_header(skb); - skb_reset_mac_len(skb); - skb_pull_rcsum(skb, ETH_HLEN); - - return skb; -} -EXPORT_SYMBOL_GPL(dsa_8021q_remove_header); - MODULE_LICENSE("GPL v2"); diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 5366ea430349..d553bf36bd41 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -250,14 +250,14 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, { struct sja1105_meta meta = {0}; int source_port, switch_id; - struct vlan_ethhdr *hdr; + struct ethhdr *hdr; u16 tpid, vid, tci; bool is_link_local; bool is_tagged; bool is_meta; - hdr = vlan_eth_hdr(skb); - tpid = ntohs(hdr->h_vlan_proto); + hdr = eth_hdr(skb); + tpid = ntohs(hdr->h_proto); is_tagged = (tpid == ETH_P_SJA1105); is_link_local = sja1105_is_link_local(skb); is_meta = sja1105_is_meta_frame(skb); @@ -266,7 +266,12 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, if (is_tagged) { /* Normal traffic path. */ - tci = ntohs(hdr->h_vlan_TCI); + skb_push_rcsum(skb, ETH_HLEN); + __skb_vlan_pop(skb, &tci); + skb_pull_rcsum(skb, ETH_HLEN); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + vid = tci & VLAN_VID_MASK; source_port = dsa_8021q_rx_source_port(vid); switch_id = dsa_8021q_rx_switch_id(vid); @@ -295,12 +300,6 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, return NULL; } - /* Delete/overwrite fake VLAN header, DSA expects to not find - * it there, see dsa_switch_rcv: skb_push(skb, ETH_HLEN). - */ - if (is_tagged) - skb = dsa_8021q_remove_header(skb); - return sja1105_rcv_meta_state_machine(skb, &meta, is_link_local, is_meta); } -- cgit v1.2.3 From 2c64605b590edadb3fb46d1ec6badb49e940b479 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 25 Mar 2020 13:47:18 +0100 Subject: net: Fix CONFIG_NET_CLS_ACT=n and CONFIG_NFT_FWD_NETDEV={y, m} build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit net/netfilter/nft_fwd_netdev.c: In function ‘nft_fwd_netdev_eval’: net/netfilter/nft_fwd_netdev.c:32:10: error: ‘struct sk_buff’ has no member named ‘tc_redirected’ pkt->skb->tc_redirected = 1; ^~ net/netfilter/nft_fwd_netdev.c:33:10: error: ‘struct sk_buff’ has no member named ‘tc_from_ingress’ pkt->skb->tc_from_ingress = 1; ^~ To avoid a direct dependency with tc actions from netfilter, wrap the redirect bits around CONFIG_NET_REDIRECT and move helpers to include/linux/skbuff.h. Turn on this toggle from the ifb driver, the only existing client of these bits in the tree. 
This patch adds skb_set_redirected(), which sets the redirected bit on the skbuff, records whether the packet was redirected from ingress, and resets the timestamp (the timestamp reset was originally missing in the netfilter bugfix). Fixes: bcfabee1afd99484 ("netfilter: nft_fwd_netdev: allow to redirect to ifb via ingress") Reported-by: noreply@ellerman.id.au Reported-by: Geert Uytterhoeven Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- drivers/net/Kconfig | 1 + drivers/net/ifb.c | 6 +++--- drivers/net/wireguard/queueing.h | 2 +- include/linux/skbuff.h | 36 ++++++++++++++++++++++++++++++++---- include/net/sch_generic.h | 16 ---------------- net/Kconfig | 3 +++ net/core/dev.c | 4 ++-- net/core/pktgen.c | 2 +- net/netfilter/nft_fwd_netdev.c | 5 ++--- net/sched/act_mirred.c | 6 ++---- 10 files changed, 47 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 25a8f9387d5a..db8884ad6d40 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -149,6 +149,7 @@ config NET_FC config IFB tristate "Intermediate Functional Block support" depends on NET_CLS_ACT + select NET_REDIRECT ---help--- This is an intermediate driver that allows sharing of resources. diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c index 242b9b0943f8..7fe306e76281 100644 --- a/drivers/net/ifb.c +++ b/drivers/net/ifb.c @@ -75,7 +75,7 @@ static void ifb_ri_tasklet(unsigned long _txp) } while ((skb = __skb_dequeue(&txp->tq)) != NULL) { - skb->tc_redirected = 0; + skb->redirected = 0; skb->tc_skip_classify = 1; u64_stats_update_begin(&txp->tsync); @@ -96,7 +96,7 @@ static void ifb_ri_tasklet(unsigned long _txp) rcu_read_unlock(); skb->skb_iif = txp->dev->ifindex; - if (!skb->tc_from_ingress) { + if (!skb->from_ingress) { dev_queue_xmit(skb); } else { skb_pull_rcsum(skb, skb->mac_len); @@ -243,7 +243,7 @@ static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev) txp->rx_bytes += skb->len; u64_stats_update_end(&txp->rsync); - if (!skb->tc_redirected || !skb->skb_iif) { + if (!skb->redirected || !skb->skb_iif) { dev_kfree_skb(skb); dev->stats.rx_dropped++; return NETDEV_TX_OK; diff --git a/drivers/net/wireguard/queueing.h b/drivers/net/wireguard/queueing.h index cf1e0e2376d8..3432232afe06 100644 --- a/drivers/net/wireguard/queueing.h +++ b/drivers/net/wireguard/queueing.h @@ -100,8 +100,8 @@ static inline void wg_reset_packet(struct sk_buff *skb) skb->dev = NULL; #ifdef CONFIG_NET_SCHED skb->tc_index = 0; - skb_reset_tc(skb); #endif + skb_reset_redirect(skb); skb->hdr_len = skb_headroom(skb); skb_reset_mac_header(skb); skb_reset_network_header(skb); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5b50278c4bc8..e59620234415 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -645,8 +645,8 @@ typedef unsigned char *sk_buff_data_t; * @offload_l3_fwd_mark: Packet was L3-forwarded in hardware * @tc_skip_classify: do not classify packet.
set by IFB device * @tc_at_ingress: used within tc_classify to distinguish in/egress - * @tc_redirected: packet was redirected by a tc action - * @tc_from_ingress: if tc_redirected, tc_at_ingress at time of redirect + * @redirected: packet was redirected by packet classifier + * @from_ingress: packet was redirected from the ingress path * @peeked: this packet has been seen already, so stats have been * done for it, don't do them again * @nf_trace: netfilter packet trace flag @@ -848,8 +848,10 @@ struct sk_buff { #ifdef CONFIG_NET_CLS_ACT __u8 tc_skip_classify:1; __u8 tc_at_ingress:1; - __u8 tc_redirected:1; - __u8 tc_from_ingress:1; +#endif +#ifdef CONFIG_NET_REDIRECT + __u8 redirected:1; + __u8 from_ingress:1; #endif #ifdef CONFIG_TLS_DEVICE __u8 decrypted:1; @@ -4579,5 +4581,31 @@ static inline __wsum lco_csum(struct sk_buff *skb) return csum_partial(l4_hdr, csum_start - l4_hdr, partial); } +static inline bool skb_is_redirected(const struct sk_buff *skb) +{ +#ifdef CONFIG_NET_REDIRECT + return skb->redirected; +#else + return false; +#endif +} + +static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress) +{ +#ifdef CONFIG_NET_REDIRECT + skb->redirected = 1; + skb->from_ingress = from_ingress; + if (skb->from_ingress) + skb->tstamp = 0; +#endif +} + +static inline void skb_reset_redirect(struct sk_buff *skb) +{ +#ifdef CONFIG_NET_REDIRECT + skb->redirected = 0; +#endif +} + #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 151208704ed2..c30f914867e6 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -675,22 +675,6 @@ void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab); int skb_do_redirect(struct sk_buff *); -static inline void skb_reset_tc(struct sk_buff *skb) -{ -#ifdef CONFIG_NET_CLS_ACT - skb->tc_redirected = 0; -#endif -} - -static inline bool skb_is_tc_redirected(const struct sk_buff *skb) -{ -#ifdef CONFIG_NET_CLS_ACT - return skb->tc_redirected; -#else - return false; -#endif -} - static inline bool skb_at_tc_ingress(const struct sk_buff *skb) { #ifdef CONFIG_NET_CLS_ACT diff --git a/net/Kconfig b/net/Kconfig index 2eeb0e55f7c9..df8d8c9bd021 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -52,6 +52,9 @@ config NET_INGRESS config NET_EGRESS bool +config NET_REDIRECT + bool + config SKB_EXTENSIONS bool diff --git a/net/core/dev.c b/net/core/dev.c index 402a986659cf..500bba8874b0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4516,7 +4516,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. 
*/ - if (skb_is_tc_redirected(skb)) + if (skb_is_redirected(skb)) return XDP_PASS; /* XDP packets must be linear and must have sufficient headroom @@ -5063,7 +5063,7 @@ skip_taps: goto out; } #endif - skb_reset_tc(skb); + skb_reset_redirect(skb); skip_classify: if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) goto drop; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index acc849df60b5..d0641bba6b81 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3362,7 +3362,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) /* skb was 'freed' by stack, so clean few * bits and reuse it */ - skb_reset_tc(skb); + skb_reset_redirect(skb); } while (--burst > 0); goto out; /* Skips xmit_mode M_START_XMIT */ } else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) { diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 74f050ba6bad..3087e23297db 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -28,9 +28,8 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr, struct nft_fwd_netdev *priv = nft_expr_priv(expr); int oif = regs->data[priv->sreg_dev]; - /* These are used by ifb only. */ - pkt->skb->tc_redirected = 1; - pkt->skb->tc_from_ingress = 1; + /* This is used by ifb only. */ + skb_set_redirected(pkt->skb, true); nf_fwd_netdev_egress(pkt, oif); regs->verdict.code = NF_STOLEN; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 1ad300e6dbc0..83dd82fc9f40 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -284,10 +284,8 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, /* mirror is always swallowed */ if (is_redirect) { - skb2->tc_redirected = 1; - skb2->tc_from_ingress = skb2->tc_at_ingress; - if (skb2->tc_from_ingress) - skb2->tstamp = 0; + skb_set_redirected(skb2, skb2->tc_at_ingress); + /* let's the caller reinsert the packet, if possible */ if (use_reinsert) { res->ingress = want_ingress; -- cgit v1.2.3
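For reference, a minimal usage sketch of the new helpers, based on the hunks above (the function names below are illustrative, not kernel API):

  #include <linux/skbuff.h>

  /* A redirecting action (e.g. act_mirred) marks the packet, recording
   * whether it was taken from the ingress path; for ingress redirects
   * skb_set_redirected() also zeroes the timestamp.
   */
  static void example_redirect(struct sk_buff *skb, bool at_ingress)
  {
          skb_set_redirected(skb, at_ingress);
  }

  /* The core receive path skips generic XDP for reinjected packets and
   * clears the mark before normal classification resumes.
   */
  static bool example_receive(struct sk_buff *skb)
  {
          if (skb_is_redirected(skb))
                  return false;   /* e.g. generic XDP is skipped */

          skb_reset_redirect(skb);
          return true;
  }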