From f92f26f2ed2c9f92c9270c705bca96310c3cdf5a Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Fri, 24 Apr 2020 12:20:08 +0300 Subject: iwlwifi: pcie: handle QuZ configs with killer NICs as well The killer devices were left out of the checks that convert Qu-B0 to QuZ configurations. Add them. Cc: stable@vger.kernel.org # v5.3+ Fixes: 5a8c31aa6357 ("iwlwifi: pcie: fix recognition of QuZ devices") Signed-off-by: Luca Coelho Tested-by: You-Sheng Yang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/iwlwifi.20200424121518.b715acfbe211.I273a098064a22577e4fca767910fd9cf0013f5cb@changeid --- drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index 6744c0281ffb..29971c25dba4 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -1092,6 +1092,10 @@ static int iwl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) iwl_trans->cfg = &iwl_ax101_cfg_quz_hr; else if (iwl_trans->cfg == &iwl_ax201_cfg_qu_hr) iwl_trans->cfg = &iwl_ax201_cfg_quz_hr; + else if (iwl_trans->cfg == &killer1650s_2ax_cfg_qu_b0_hr_b0) + iwl_trans->cfg = &iwl_ax1650s_cfg_quz_hr; + else if (iwl_trans->cfg == &killer1650i_2ax_cfg_qu_b0_hr_b0) + iwl_trans->cfg = &iwl_ax1650i_cfg_quz_hr; } #endif -- cgit v1.2.3 From c410bf01933e5e09d142c66c3df9ad470a7eec13 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 11 May 2020 14:54:34 +0100 Subject: rxrpc: Fix the excessive initial retransmission timeout rxrpc currently uses a fixed 4s retransmission timeout until the RTT is sufficiently sampled. This can cause problems with some fileservers with calls to the cache manager in the afs filesystem being dropped from the fileserver because a packet goes missing and the retransmission timeout is greater than the call expiry timeout. Fix this by: (1) Copying the RTT/RTO calculation code from Linux's TCP implementation and altering it to fit rxrpc. (2) Altering the various users of the RTT to make use of the new SRTT value. (3) Replacing the use of rxrpc_resend_timeout to use the calculated RTO value instead (which is needed in jiffies), along with a backoff. Notes: (1) rxrpc provides RTT samples by matching the serial numbers on outgoing DATA packets that have the RXRPC_REQUEST_ACK set and PING ACK packets against the reference serial number in incoming REQUESTED ACK and PING-RESPONSE ACK packets. (2) Each packet that is transmitted on an rxrpc connection gets a new per-connection serial number, even for retransmissions, so an ACK can be cross-referenced to a specific trigger packet. This allows RTT information to be drawn from retransmitted DATA packets also. (3) rxrpc maintains the RTT/RTO state on the rxrpc_peer record rather than on an rxrpc_call because many RPC calls won't live long enough to generate more than one sample. (4) The calculated SRTT value is in units of 8ths of a microsecond rather than nanoseconds. The (S)RTT and RTO values are displayed in /proc/net/rxrpc/peers. 
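To make note (4) concrete, a minimal userspace sketch of how the scaled SRTT turns into the RTO that ends up in peer->rto_j is shown below. The >>3 shift, the SRTT+RTTVAR sum, the 120-second cap and the per-retransmission doubling mirror the rtt.c code added by this patch; HZ, the local usecs_to_jiffies() stand-in and the sample numbers are assumptions made purely for this example.

#include <stdint.h>
#include <stdio.h>

#define HZ 1000u                        /* assumed tick rate for this sketch */
#define RXRPC_RTO_MAX (120u * HZ)       /* 120 second cap, as in rtt.c */

/* Local stand-in for the kernel's usecs_to_jiffies(), close enough here. */
static uint32_t usecs_to_jiffies(uint32_t us)
{
    return (us * HZ + 999999u) / 1000000u;
}

int main(void)
{
    uint32_t srtt_us   = 5000u << 3;  /* 5ms smoothed RTT, stored <<3 in usecs */
    uint32_t rttvar_us = 2000u;       /* smoothed deviation, in usecs */
    uint8_t  backoff   = 1;           /* one retransmission already backed off */

    /* __rxrpc_set_rto(): RTO = SRTT + RTTVAR, converted to jiffies... */
    uint32_t rto_j = usecs_to_jiffies((srtt_us >> 3) + rttvar_us);

    /* ...and rxrpc_bound_rto() caps it at RXRPC_RTO_MAX. */
    if (rto_j > RXRPC_RTO_MAX)
        rto_j = RXRPC_RTO_MAX;

    /* rxrpc_get_rto_backoff(): each retransmission doubles the timeout. */
    uint64_t timo_j = (uint64_t)rto_j << backoff;

    printf("srtt=%uus rto=%u jiffies backed-off=%llu jiffies\n",
           srtt_us >> 3, rto_j, (unsigned long long)timo_j);
    return 0;
}
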
Fixes: 17926a79320a ([AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both"") Signed-off-by: David Howells --- fs/afs/fs_probe.c | 18 ++-- fs/afs/vl_probe.c | 18 ++-- include/net/af_rxrpc.h | 2 +- include/trace/events/rxrpc.h | 17 ++-- net/rxrpc/Makefile | 1 + net/rxrpc/ar-internal.h | 25 ++++-- net/rxrpc/call_accept.c | 2 +- net/rxrpc/call_event.c | 22 ++--- net/rxrpc/input.c | 6 +- net/rxrpc/misc.c | 5 -- net/rxrpc/output.c | 9 +- net/rxrpc/peer_event.c | 46 ---------- net/rxrpc/peer_object.c | 12 +-- net/rxrpc/proc.c | 8 +- net/rxrpc/rtt.c | 195 +++++++++++++++++++++++++++++++++++++++++++ net/rxrpc/sendmsg.c | 26 ++---- net/rxrpc/sysctl.c | 9 -- 17 files changed, 266 insertions(+), 155 deletions(-) create mode 100644 net/rxrpc/rtt.c diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c index a587767b6ae1..237352d3cb53 100644 --- a/fs/afs/fs_probe.c +++ b/fs/afs/fs_probe.c @@ -32,9 +32,8 @@ void afs_fileserver_probe_result(struct afs_call *call) struct afs_server *server = call->server; unsigned int server_index = call->server_index; unsigned int index = call->addr_ix; - unsigned int rtt = UINT_MAX; + unsigned int rtt_us; bool have_result = false; - u64 _rtt; int ret = call->error; _enter("%pU,%u", &server->uuid, index); @@ -93,15 +92,9 @@ responded: } } - /* Get the RTT and scale it to fit into a 32-bit value that represents - * over a minute of time so that we can access it with one instruction - * on a 32-bit system. - */ - _rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall); - _rtt /= 64; - rtt = (_rtt > UINT_MAX) ? UINT_MAX : _rtt; - if (rtt < server->probe.rtt) { - server->probe.rtt = rtt; + rtt_us = rxrpc_kernel_get_srtt(call->net->socket, call->rxcall); + if (rtt_us < server->probe.rtt) { + server->probe.rtt = rtt_us; alist->preferred = index; have_result = true; } @@ -113,8 +106,7 @@ out: spin_unlock(&server->probe_lock); _debug("probe [%u][%u] %pISpc rtt=%u ret=%d", - server_index, index, &alist->addrs[index].transport, - (unsigned int)rtt, ret); + server_index, index, &alist->addrs[index].transport, rtt_us, ret); have_result |= afs_fs_probe_done(server); if (have_result) diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c index 858498cc1b05..e3aa013c2177 100644 --- a/fs/afs/vl_probe.c +++ b/fs/afs/vl_probe.c @@ -31,10 +31,9 @@ void afs_vlserver_probe_result(struct afs_call *call) struct afs_addr_list *alist = call->alist; struct afs_vlserver *server = call->vlserver; unsigned int server_index = call->server_index; + unsigned int rtt_us = 0; unsigned int index = call->addr_ix; - unsigned int rtt = UINT_MAX; bool have_result = false; - u64 _rtt; int ret = call->error; _enter("%s,%u,%u,%d,%d", server->name, server_index, index, ret, call->abort_code); @@ -93,15 +92,9 @@ responded: } } - /* Get the RTT and scale it to fit into a 32-bit value that represents - * over a minute of time so that we can access it with one instruction - * on a 32-bit system. - */ - _rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall); - _rtt /= 64; - rtt = (_rtt > UINT_MAX) ? 
UINT_MAX : _rtt; - if (rtt < server->probe.rtt) { - server->probe.rtt = rtt; + rtt_us = rxrpc_kernel_get_srtt(call->net->socket, call->rxcall); + if (rtt_us < server->probe.rtt) { + server->probe.rtt = rtt_us; alist->preferred = index; have_result = true; } @@ -113,8 +106,7 @@ out: spin_unlock(&server->probe_lock); _debug("probe [%u][%u] %pISpc rtt=%u ret=%d", - server_index, index, &alist->addrs[index].transport, - (unsigned int)rtt, ret); + server_index, index, &alist->addrs[index].transport, rtt_us, ret); have_result |= afs_vl_probe_done(server); if (have_result) { diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 04e97bab6f28..ab988940bf04 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -59,7 +59,7 @@ bool rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, void rxrpc_kernel_end_call(struct socket *, struct rxrpc_call *); void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *, struct sockaddr_rxrpc *); -u64 rxrpc_kernel_get_rtt(struct socket *, struct rxrpc_call *); +u32 rxrpc_kernel_get_srtt(struct socket *, struct rxrpc_call *); int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, rxrpc_user_attach_call_t, unsigned long, gfp_t, unsigned int); diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 191fe447f990..ab75f261f04a 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -1112,18 +1112,17 @@ TRACE_EVENT(rxrpc_rtt_tx, TRACE_EVENT(rxrpc_rtt_rx, TP_PROTO(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, - s64 rtt, u8 nr, s64 avg), + u32 rtt, u32 rto), - TP_ARGS(call, why, send_serial, resp_serial, rtt, nr, avg), + TP_ARGS(call, why, send_serial, resp_serial, rtt, rto), TP_STRUCT__entry( __field(unsigned int, call ) __field(enum rxrpc_rtt_rx_trace, why ) - __field(u8, nr ) __field(rxrpc_serial_t, send_serial ) __field(rxrpc_serial_t, resp_serial ) - __field(s64, rtt ) - __field(u64, avg ) + __field(u32, rtt ) + __field(u32, rto ) ), TP_fast_assign( @@ -1132,18 +1131,16 @@ TRACE_EVENT(rxrpc_rtt_rx, __entry->send_serial = send_serial; __entry->resp_serial = resp_serial; __entry->rtt = rtt; - __entry->nr = nr; - __entry->avg = avg; + __entry->rto = rto; ), - TP_printk("c=%08x %s sr=%08x rr=%08x rtt=%lld nr=%u avg=%lld", + TP_printk("c=%08x %s sr=%08x rr=%08x rtt=%u rto=%u", __entry->call, __print_symbolic(__entry->why, rxrpc_rtt_rx_traces), __entry->send_serial, __entry->resp_serial, __entry->rtt, - __entry->nr, - __entry->avg) + __entry->rto) ); TRACE_EVENT(rxrpc_timer, diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index 6ffb7e9887ce..ddd0f95713a9 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -25,6 +25,7 @@ rxrpc-y := \ peer_event.o \ peer_object.o \ recvmsg.o \ + rtt.o \ security.o \ sendmsg.o \ skbuff.o \ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 3eb1ab40ca5c..9fe264bec70c 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -311,11 +312,14 @@ struct rxrpc_peer { #define RXRPC_RTT_CACHE_SIZE 32 spinlock_t rtt_input_lock; /* RTT lock for input routine */ ktime_t rtt_last_req; /* Time of last RTT request */ - u64 rtt; /* Current RTT estimate (in nS) */ - u64 rtt_sum; /* Sum of cache contents */ - u64 rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* Determined RTT cache */ - u8 rtt_cursor; /* next entry at which to insert */ - u8 rtt_usage; /* amount of cache actually used */ + unsigned 
int rtt_count; /* Number of samples we've got */ + + u32 srtt_us; /* smoothed round trip time << 3 in usecs */ + u32 mdev_us; /* medium deviation */ + u32 mdev_max_us; /* maximal mdev for the last rtt period */ + u32 rttvar_us; /* smoothed mdev_max */ + u32 rto_j; /* Retransmission timeout in jiffies */ + u8 backoff; /* Backoff timeout */ u8 cong_cwnd; /* Congestion window size */ }; @@ -1041,7 +1045,6 @@ extern unsigned long rxrpc_idle_ack_delay; extern unsigned int rxrpc_rx_window_size; extern unsigned int rxrpc_rx_mtu; extern unsigned int rxrpc_rx_jumbo_max; -extern unsigned long rxrpc_resend_timeout; extern const s8 rxrpc_ack_priority[]; @@ -1069,8 +1072,6 @@ void rxrpc_send_keepalive(struct rxrpc_peer *); * peer_event.c */ void rxrpc_error_report(struct sock *); -void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, - rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t); void rxrpc_peer_keepalive_worker(struct work_struct *); /* @@ -1102,6 +1103,14 @@ extern const struct seq_operations rxrpc_peer_seq_ops; void rxrpc_notify_socket(struct rxrpc_call *); int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int); +/* + * rtt.c + */ +void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, + rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t); +unsigned long rxrpc_get_rto_backoff(struct rxrpc_peer *, bool); +void rxrpc_peer_init_rtt(struct rxrpc_peer *); + /* * rxkad.c */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 70e44abf106c..b7611cc159e5 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -248,7 +248,7 @@ static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb) struct rxrpc_skb_priv *sp = rxrpc_skb(skb); ktime_t now = skb->tstamp; - if (call->peer->rtt_usage < 3 || + if (call->peer->rtt_count < 3 || ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now)) rxrpc_propose_ACK(call, RXRPC_ACK_PING, sp->hdr.serial, true, true, diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index cedbbb3a7c2e..2a65ac41055f 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -111,8 +111,8 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, } else { unsigned long now = jiffies, ack_at; - if (call->peer->rtt_usage > 0) - ack_at = nsecs_to_jiffies(call->peer->rtt); + if (call->peer->srtt_us != 0) + ack_at = usecs_to_jiffies(call->peer->srtt_us >> 3); else ack_at = expiry; @@ -157,24 +157,18 @@ static void rxrpc_congestion_timeout(struct rxrpc_call *call) static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) { struct sk_buff *skb; - unsigned long resend_at; + unsigned long resend_at, rto_j; rxrpc_seq_t cursor, seq, top; - ktime_t now, max_age, oldest, ack_ts, timeout, min_timeo; + ktime_t now, max_age, oldest, ack_ts; int ix; u8 annotation, anno_type, retrans = 0, unacked = 0; _enter("{%d,%d}", call->tx_hard_ack, call->tx_top); - if (call->peer->rtt_usage > 1) - timeout = ns_to_ktime(call->peer->rtt * 3 / 2); - else - timeout = ms_to_ktime(rxrpc_resend_timeout); - min_timeo = ns_to_ktime((1000000000 / HZ) * 4); - if (ktime_before(timeout, min_timeo)) - timeout = min_timeo; + rto_j = call->peer->rto_j; now = ktime_get_real(); - max_age = ktime_sub(now, timeout); + max_age = ktime_sub(now, jiffies_to_usecs(rto_j)); spin_lock_bh(&call->lock); @@ -219,7 +213,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) } resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(now, oldest))); - resend_at += jiffies + rxrpc_resend_timeout; + 
resend_at += jiffies + rto_j; WRITE_ONCE(call->resend_at, resend_at); if (unacked) @@ -234,7 +228,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) rxrpc_timer_set_for_resend); spin_unlock_bh(&call->lock); ack_ts = ktime_sub(now, call->acks_latest_ts); - if (ktime_to_ns(ack_ts) < call->peer->rtt) + if (ktime_to_us(ack_ts) < (call->peer->srtt_us >> 3)) goto out; rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, true, false, rxrpc_propose_ack_ping_for_lost_ack); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 69e09d69c896..e438bfd3fdf5 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -91,11 +91,11 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, /* We analyse the number of packets that get ACK'd per RTT * period and increase the window if we managed to fill it. */ - if (call->peer->rtt_usage == 0) + if (call->peer->rtt_count == 0) goto out; if (ktime_before(skb->tstamp, - ktime_add_ns(call->cong_tstamp, - call->peer->rtt))) + ktime_add_us(call->cong_tstamp, + call->peer->srtt_us >> 3))) goto out_no_clear_ca; change = rxrpc_cong_rtt_window_end; call->cong_tstamp = skb->tstamp; diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 214405f75346..d4144fd86f84 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -63,11 +63,6 @@ unsigned int rxrpc_rx_mtu = 5692; */ unsigned int rxrpc_rx_jumbo_max = 4; -/* - * Time till packet resend (in milliseconds). - */ -unsigned long rxrpc_resend_timeout = 4 * HZ; - const s8 rxrpc_ack_priority[] = { [0] = 0, [RXRPC_ACK_DELAY] = 1, diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 90e263c6aa69..f8b632a5c619 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -369,7 +369,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events) || retrans || call->cong_mode == RXRPC_CALL_SLOW_START || - (call->peer->rtt_usage < 3 && sp->hdr.seq & 1) || + (call->peer->rtt_count < 3 && sp->hdr.seq & 1) || ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), ktime_get_real()))) whdr.flags |= RXRPC_REQUEST_ACK; @@ -423,13 +423,10 @@ done: if (whdr.flags & RXRPC_REQUEST_ACK) { call->peer->rtt_last_req = skb->tstamp; trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, serial); - if (call->peer->rtt_usage > 1) { + if (call->peer->rtt_count > 1) { unsigned long nowj = jiffies, ack_lost_at; - ack_lost_at = nsecs_to_jiffies(2 * call->peer->rtt); - if (ack_lost_at < 1) - ack_lost_at = 1; - + ack_lost_at = rxrpc_get_rto_backoff(call->peer, retrans); ack_lost_at += nowj; WRITE_ONCE(call->ack_lost_at, ack_lost_at); rxrpc_reduce_call_timer(call, ack_lost_at, nowj, diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 923b263c401b..b1449d971883 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -295,52 +295,6 @@ static void rxrpc_distribute_error(struct rxrpc_peer *peer, int error, } } -/* - * Add RTT information to cache. This is called in softirq mode and has - * exclusive access to the peer RTT data. 
- */ -void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, - rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, - ktime_t send_time, ktime_t resp_time) -{ - struct rxrpc_peer *peer = call->peer; - s64 rtt; - u64 sum = peer->rtt_sum, avg; - u8 cursor = peer->rtt_cursor, usage = peer->rtt_usage; - - rtt = ktime_to_ns(ktime_sub(resp_time, send_time)); - if (rtt < 0) - return; - - spin_lock(&peer->rtt_input_lock); - - /* Replace the oldest datum in the RTT buffer */ - sum -= peer->rtt_cache[cursor]; - sum += rtt; - peer->rtt_cache[cursor] = rtt; - peer->rtt_cursor = (cursor + 1) & (RXRPC_RTT_CACHE_SIZE - 1); - peer->rtt_sum = sum; - if (usage < RXRPC_RTT_CACHE_SIZE) { - usage++; - peer->rtt_usage = usage; - } - - spin_unlock(&peer->rtt_input_lock); - - /* Now recalculate the average */ - if (usage == RXRPC_RTT_CACHE_SIZE) { - avg = sum / RXRPC_RTT_CACHE_SIZE; - } else { - avg = sum; - do_div(avg, usage); - } - - /* Don't need to update this under lock */ - peer->rtt = avg; - trace_rxrpc_rtt_rx(call, why, send_serial, resp_serial, rtt, - usage, avg); -} - /* * Perform keep-alive pings. */ diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 452163eadb98..ca29976bb193 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -225,6 +225,8 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) spin_lock_init(&peer->rtt_input_lock); peer->debug_id = atomic_inc_return(&rxrpc_debug_id); + rxrpc_peer_init_rtt(peer); + if (RXRPC_TX_SMSS > 2190) peer->cong_cwnd = 2; else if (RXRPC_TX_SMSS > 1095) @@ -497,14 +499,14 @@ void rxrpc_kernel_get_peer(struct socket *sock, struct rxrpc_call *call, EXPORT_SYMBOL(rxrpc_kernel_get_peer); /** - * rxrpc_kernel_get_rtt - Get a call's peer RTT + * rxrpc_kernel_get_srtt - Get a call's peer smoothed RTT * @sock: The socket on which the call is in progress. * @call: The call to query * - * Get the call's peer RTT. + * Get the call's peer smoothed RTT. */ -u64 rxrpc_kernel_get_rtt(struct socket *sock, struct rxrpc_call *call) +u32 rxrpc_kernel_get_srtt(struct socket *sock, struct rxrpc_call *call) { - return call->peer->rtt; + return call->peer->srtt_us >> 3; } -EXPORT_SYMBOL(rxrpc_kernel_get_rtt); +EXPORT_SYMBOL(rxrpc_kernel_get_srtt); diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index b9d053e42821..8b179e3c802a 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -222,7 +222,7 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "Proto Local " " Remote " - " Use CW MTU LastUse RTT Rc\n" + " Use CW MTU LastUse RTT RTO\n" ); return 0; } @@ -236,15 +236,15 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v) now = ktime_get_seconds(); seq_printf(seq, "UDP %-47.47s %-47.47s %3u" - " %3u %5u %6llus %12llu %2u\n", + " %3u %5u %6llus %8u %8u\n", lbuff, rbuff, atomic_read(&peer->usage), peer->cong_cwnd, peer->mtu, now - peer->last_tx_at, - peer->rtt, - peer->rtt_cursor); + peer->srtt_us >> 3, + jiffies_to_usecs(peer->rto_j)); return 0; } diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c new file mode 100644 index 000000000000..928d8b34a3ee --- /dev/null +++ b/net/rxrpc/rtt.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0 +/* RTT/RTO calculation. 
+ * + * Adapted from TCP for AF_RXRPC by David Howells (dhowells@redhat.com) + * + * https://tools.ietf.org/html/rfc6298 + * https://tools.ietf.org/html/rfc1122#section-4.2.3.1 + * http://ccr.sigcomm.org/archive/1995/jan95/ccr-9501-partridge87.pdf + */ + +#include +#include "ar-internal.h" + +#define RXRPC_RTO_MAX ((unsigned)(120 * HZ)) +#define RXRPC_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */ +#define rxrpc_jiffies32 ((u32)jiffies) /* As rxrpc_jiffies32 */ +#define rxrpc_min_rtt_wlen 300 /* As sysctl_tcp_min_rtt_wlen */ + +static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer) +{ + return 200; +} + +static u32 __rxrpc_set_rto(const struct rxrpc_peer *peer) +{ + return _usecs_to_jiffies((peer->srtt_us >> 3) + peer->rttvar_us); +} + +static u32 rxrpc_bound_rto(u32 rto) +{ + return min(rto, RXRPC_RTO_MAX); +} + +/* + * Called to compute a smoothed rtt estimate. The data fed to this + * routine either comes from timestamps, or from segments that were + * known _not_ to have been retransmitted [see Karn/Partridge + * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 + * piece by Van Jacobson. + * NOTE: the next three routines used to be one big routine. + * To save cycles in the RFC 1323 implementation it was better to break + * it up into three procedures. -- erics + */ +static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us) +{ + long m = sample_rtt_us; /* RTT */ + u32 srtt = peer->srtt_us; + + /* The following amusing code comes from Jacobson's + * article in SIGCOMM '88. Note that rtt and mdev + * are scaled versions of rtt and mean deviation. + * This is designed to be as fast as possible + * m stands for "measurement". + * + * On a 1990 paper the rto value is changed to: + * RTO = rtt + 4 * mdev + * + * Funny. This algorithm seems to be very broken. + * These formulae increase RTO, when it should be decreased, increase + * too slowly, when it should be increased quickly, decrease too quickly + * etc. I guess in BSD RTO takes ONE value, so that it is absolutely + * does not matter how to _calculate_ it. Seems, it was trap + * that VJ failed to avoid. 8) + */ + if (srtt != 0) { + m -= (srtt >> 3); /* m is now error in rtt est */ + srtt += m; /* rtt = 7/8 rtt + 1/8 new */ + if (m < 0) { + m = -m; /* m is now abs(error) */ + m -= (peer->mdev_us >> 2); /* similar update on mdev */ + /* This is similar to one of Eifel findings. + * Eifel blocks mdev updates when rtt decreases. + * This solution is a bit different: we use finer gain + * for mdev in this case (alpha*beta). + * Like Eifel it also prevents growth of rto, + * but also it limits too fast rto decreases, + * happening in pure Eifel. + */ + if (m > 0) + m >>= 3; + } else { + m -= (peer->mdev_us >> 2); /* similar update on mdev */ + } + + peer->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ + if (peer->mdev_us > peer->mdev_max_us) { + peer->mdev_max_us = peer->mdev_us; + if (peer->mdev_max_us > peer->rttvar_us) + peer->rttvar_us = peer->mdev_max_us; + } + } else { + /* no previous measure. */ + srtt = m << 3; /* take the measured time to be rtt */ + peer->mdev_us = m << 1; /* make sure rto = 3*rtt */ + peer->rttvar_us = max(peer->mdev_us, rxrpc_rto_min_us(peer)); + peer->mdev_max_us = peer->rttvar_us; + } + + peer->srtt_us = max(1U, srtt); +} + +/* + * Calculate rto without backoff. This is the second half of Van Jacobson's + * routine referred to above. + */ +static void rxrpc_set_rto(struct rxrpc_peer *peer) +{ + u32 rto; + + /* 1. 
If rtt variance happened to be less 50msec, it is hallucination. + * It cannot be less due to utterly erratic ACK generation made + * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ + * to do with delayed acks, because at cwnd>2 true delack timeout + * is invisible. Actually, Linux-2.4 also generates erratic + * ACKs in some circumstances. + */ + rto = __rxrpc_set_rto(peer); + + /* 2. Fixups made earlier cannot be right. + * If we do not estimate RTO correctly without them, + * all the algo is pure shit and should be replaced + * with correct one. It is exactly, which we pretend to do. + */ + + /* NOTE: clamping at RXRPC_RTO_MIN is not required, current algo + * guarantees that rto is higher. + */ + peer->rto_j = rxrpc_bound_rto(rto); +} + +static void rxrpc_ack_update_rtt(struct rxrpc_peer *peer, long rtt_us) +{ + if (rtt_us < 0) + return; + + //rxrpc_update_rtt_min(peer, rtt_us); + rxrpc_rtt_estimator(peer, rtt_us); + rxrpc_set_rto(peer); + + /* RFC6298: only reset backoff on valid RTT measurement. */ + peer->backoff = 0; +} + +/* + * Add RTT information to cache. This is called in softirq mode and has + * exclusive access to the peer RTT data. + */ +void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, + rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, + ktime_t send_time, ktime_t resp_time) +{ + struct rxrpc_peer *peer = call->peer; + s64 rtt_us; + + rtt_us = ktime_to_us(ktime_sub(resp_time, send_time)); + if (rtt_us < 0) + return; + + spin_lock(&peer->rtt_input_lock); + rxrpc_ack_update_rtt(peer, rtt_us); + if (peer->rtt_count < 3) + peer->rtt_count++; + spin_unlock(&peer->rtt_input_lock); + + trace_rxrpc_rtt_rx(call, why, send_serial, resp_serial, + peer->srtt_us >> 3, peer->rto_j); +} + +/* + * Get the retransmission timeout to set in jiffies, backing it off each time + * we retransmit. 
+ */ +unsigned long rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans) +{ + u64 timo_j; + u8 backoff = READ_ONCE(peer->backoff); + + timo_j = peer->rto_j; + timo_j <<= backoff; + if (retrans && timo_j * 2 <= RXRPC_RTO_MAX) + WRITE_ONCE(peer->backoff, backoff + 1); + + if (timo_j < 1) + timo_j = 1; + + return timo_j; +} + +void rxrpc_peer_init_rtt(struct rxrpc_peer *peer) +{ + peer->rto_j = RXRPC_TIMEOUT_INIT; + peer->mdev_us = jiffies_to_usecs(RXRPC_TIMEOUT_INIT); + peer->backoff = 0; + //minmax_reset(&peer->rtt_min, rxrpc_jiffies32, ~0U); +} diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 0fcf157aa09f..5e9c43d4a314 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -66,15 +66,14 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx, struct rxrpc_call *call) { rxrpc_seq_t tx_start, tx_win; - signed long rtt2, timeout; - u64 rtt; + signed long rtt, timeout; - rtt = READ_ONCE(call->peer->rtt); - rtt2 = nsecs_to_jiffies64(rtt) * 2; - if (rtt2 < 2) - rtt2 = 2; + rtt = READ_ONCE(call->peer->srtt_us) >> 3; + rtt = usecs_to_jiffies(rtt) * 2; + if (rtt < 2) + rtt = 2; - timeout = rtt2; + timeout = rtt; tx_start = READ_ONCE(call->tx_hard_ack); for (;;) { @@ -92,7 +91,7 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx, return -EINTR; if (tx_win != tx_start) { - timeout = rtt2; + timeout = rtt; tx_start = tx_win; } @@ -271,16 +270,9 @@ static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, _debug("need instant resend %d", ret); rxrpc_instant_resend(call, ix); } else { - unsigned long now = jiffies, resend_at; + unsigned long now = jiffies; + unsigned long resend_at = now + call->peer->rto_j; - if (call->peer->rtt_usage > 1) - resend_at = nsecs_to_jiffies(call->peer->rtt * 3 / 2); - else - resend_at = rxrpc_resend_timeout; - if (resend_at < 1) - resend_at = 1; - - resend_at += now; WRITE_ONCE(call->resend_at, resend_at); rxrpc_reduce_call_timer(call, resend_at, now, rxrpc_timer_set_for_send); diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index 2bbb38161851..18dade4e6f9a 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -71,15 +71,6 @@ static struct ctl_table rxrpc_sysctl_table[] = { .extra1 = (void *)&one_jiffy, .extra2 = (void *)&max_jiffies, }, - { - .procname = "resend_timeout", - .data = &rxrpc_resend_timeout, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_ms_jiffies_minmax, - .extra1 = (void *)&one_jiffy, - .extra2 = (void *)&max_jiffies, - }, /* Non-time values */ { -- cgit v1.2.3 From b6dd5acde3f165e364881c36de942c5b252e2a27 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Sat, 16 May 2020 13:15:15 +0530 Subject: ipv6: Fix suspicious RCU usage warning in ip6mr This patch fixes the following warning: ============================= WARNING: suspicious RCU usage 5.7.0-rc4-next-20200507-syzkaller #0 Not tainted ----------------------------- net/ipv6/ip6mr.c:124 RCU-list traversed in non-reader section!! ipmr_new_table() returns an existing table, but there is no table at init. Therefore the condition: either holding rtnl or the list is empty is used. Fixes: d1db275dd3f6e ("ipv6: ip6mr: support multiple tables") Reported-by: kernel test robot Suggested-by: Jakub Kicinski Signed-off-by: Madhuparna Bhowmik Signed-off-by: David S. 
Miller --- net/ipv6/ip6mr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 65a54d74acc1..1e223e26f079 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -98,7 +98,8 @@ static void ipmr_expire_process(struct timer_list *t); #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES #define ip6mr_for_each_table(mrt, net) \ list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list, \ - lockdep_rtnl_is_held()) + lockdep_rtnl_is_held() || \ + list_empty(&net->ipv6.mr6_tables)) static struct mr_table *ip6mr_mr_table_iter(struct net *net, struct mr_table *mrt) -- cgit v1.2.3 From 5e5502e012b8129e11be616acb0f9c34bc8f8adb Mon Sep 17 00:00:00 2001 From: DENG Qingfang Date: Wed, 13 May 2020 23:10:16 +0800 Subject: net: dsa: mt7530: fix roaming from DSA user ports When a client moves from a DSA user port to a software port in a bridge, it cannot reach any other clients that connected to the DSA user ports. That is because SA learning on the CPU port is disabled, so the switch ignores the client's frames from the CPU port and still thinks it is at the user port. Fix it by enabling SA learning on the CPU port. To prevent the switch from learning from flooding frames from the CPU port, set skb->offload_fwd_mark to 1 for unicast and broadcast frames, and let the switch flood them instead of trapping to the CPU port. Multicast frames still need to be trapped to the CPU port for snooping, so set the SA_DIS bit of the MTK tag to 1 when transmitting those frames to disable SA learning. Fixes: b8f126a8d543 ("net-next: dsa: add dsa support for Mediatek MT7530 switch") Signed-off-by: DENG Qingfang Signed-off-by: David S. Miller --- drivers/net/dsa/mt7530.c | 9 ++------- drivers/net/dsa/mt7530.h | 1 + net/dsa/tag_mtk.c | 15 +++++++++++++++ 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 5c444cd722bd..34e4aadfa705 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -628,11 +628,8 @@ mt7530_cpu_port_enable(struct mt7530_priv *priv, mt7530_write(priv, MT7530_PVC_P(port), PORT_SPEC_TAG); - /* Disable auto learning on the cpu port */ - mt7530_set(priv, MT7530_PSC_P(port), SA_DIS); - - /* Unknown unicast frame fordwarding to the cpu port */ - mt7530_set(priv, MT7530_MFC, UNU_FFP(BIT(port))); + /* Unknown multicast frame forwarding to the cpu port */ + mt7530_rmw(priv, MT7530_MFC, UNM_FFP_MASK, UNM_FFP(BIT(port))); /* Set CPU port number */ if (priv->id == ID_MT7621) @@ -1294,8 +1291,6 @@ mt7530_setup(struct dsa_switch *ds) /* Enable and reset MIB counters */ mt7530_mib_reset(ds); - mt7530_clear(priv, MT7530_MFC, UNU_FFP_MASK); - for (i = 0; i < MT7530_NUM_PORTS; i++) { /* Disable forwarding by default on all ports */ mt7530_rmw(priv, MT7530_PCR_P(i), PCR_MATRIX_MASK, diff --git a/drivers/net/dsa/mt7530.h b/drivers/net/dsa/mt7530.h index 979bb6374678..82af4d2d406e 100644 --- a/drivers/net/dsa/mt7530.h +++ b/drivers/net/dsa/mt7530.h @@ -31,6 +31,7 @@ enum { #define MT7530_MFC 0x10 #define BC_FFP(x) (((x) & 0xff) << 24) #define UNM_FFP(x) (((x) & 0xff) << 16) +#define UNM_FFP_MASK UNM_FFP(~0) #define UNU_FFP(x) (((x) & 0xff) << 8) #define UNU_FFP_MASK UNU_FFP(~0) #define CPU_EN BIT(7) diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index b5705cba8318..d6619edd53e5 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -15,6 +15,7 @@ #define MTK_HDR_XMIT_TAGGED_TPID_8100 1 #define MTK_HDR_RECV_SOURCE_PORT_MASK GENMASK(2, 0) #define MTK_HDR_XMIT_DP_BIT_MASK GENMASK(5, 0) 
+#define MTK_HDR_XMIT_SA_DIS BIT(6) static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, struct net_device *dev) @@ -22,6 +23,9 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, struct dsa_port *dp = dsa_slave_to_port(dev); u8 *mtk_tag; bool is_vlan_skb = true; + unsigned char *dest = eth_hdr(skb)->h_dest; + bool is_multicast_skb = is_multicast_ether_addr(dest) && + !is_broadcast_ether_addr(dest); /* Build the special tag after the MAC Source Address. If VLAN header * is present, it's required that VLAN header and special tag is @@ -47,6 +51,10 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, MTK_HDR_XMIT_UNTAGGED; mtk_tag[1] = (1 << dp->index) & MTK_HDR_XMIT_DP_BIT_MASK; + /* Disable SA learning for multicast frames */ + if (unlikely(is_multicast_skb)) + mtk_tag[1] |= MTK_HDR_XMIT_SA_DIS; + /* Tag control information is kept for 802.1Q */ if (!is_vlan_skb) { mtk_tag[2] = 0; @@ -61,6 +69,9 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, { int port; __be16 *phdr, hdr; + unsigned char *dest = eth_hdr(skb)->h_dest; + bool is_multicast_skb = is_multicast_ether_addr(dest) && + !is_broadcast_ether_addr(dest); if (unlikely(!pskb_may_pull(skb, MTK_HDR_LEN))) return NULL; @@ -86,6 +97,10 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, if (!skb->dev) return NULL; + /* Only unicast or broadcast frames are offloaded */ + if (likely(!is_multicast_skb)) + skb->offload_fwd_mark = 1; + return skb; } -- cgit v1.2.3 From f45a7bccdc190e2cf6ca3a527edbc4c80d7114ef Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 15 May 2020 14:52:03 -0500 Subject: net: ipa: don't be a hog in gsi_channel_poll() The iteration count value used in gsi_channel_poll() is intended to limit poll iterations to the budget supplied as an argument. But it's never updated. Fix this bug by incrementing the count each time through the loop. Reported-by: Sharath Chandra Vurukala Signed-off-by: Alex Elder Signed-off-by: David S. Miller --- drivers/net/ipa/gsi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ipa/gsi.c b/drivers/net/ipa/gsi.c index b671bea0aa7c..8d9ca1c335e8 100644 --- a/drivers/net/ipa/gsi.c +++ b/drivers/net/ipa/gsi.c @@ -1392,6 +1392,7 @@ static int gsi_channel_poll(struct napi_struct *napi, int budget) while (count < budget) { struct gsi_trans *trans; + count++; trans = gsi_channel_poll_one(channel); if (!trans) break; -- cgit v1.2.3 From 84be69b869a5a496a6cfde9b3c29509207a1f1fa Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 17 May 2020 11:26:32 -0600 Subject: nexthop: Fix attribute checking for groups For nexthop groups, attributes after NHA_GROUP_TYPE are invalid, but nh_check_attr_group starts checking at NHA_GROUP. The group type defaults to multipath and the NHA_GROUP_TYPE is currently optional so this has slipped through so far. Fix the attribute checking to handle support of new group types. Fixes: 430a049190de ("nexthop: Add support for nexthop groups") Signed-off-by: ASSOGBA Emery Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- net/ipv4/nexthop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index fdfca534d094..2a31c4af845e 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -433,7 +433,7 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[], if (!valid_group_nh(nh, len, extack)) return -EINVAL; } - for (i = NHA_GROUP + 1; i < __NHA_MAX; ++i) { + for (i = NHA_GROUP_TYPE + 1; i < __NHA_MAX; ++i) { if (!tb[i]) continue; -- cgit v1.2.3 From 61d0301e6c05db55446c7c9b3b3294244649e7bc Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Wed, 13 May 2020 16:02:49 +0200 Subject: dt-bindings: net: dsa: b53: Add missing size and address cells to example Add the missing size and address cells to the b53 example. Otherwise, it may not compile or issue warnings if directly copied into a device tree. Signed-off-by: Kurt Kanzenbach Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/dsa/b53.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/net/dsa/b53.txt b/Documentation/devicetree/bindings/net/dsa/b53.txt index 5201bc15fdd6..cfd1afdc6e94 100644 --- a/Documentation/devicetree/bindings/net/dsa/b53.txt +++ b/Documentation/devicetree/bindings/net/dsa/b53.txt @@ -110,6 +110,9 @@ Ethernet switch connected via MDIO to the host, CPU port wired to eth0: #size-cells = <0>; ports { + #address-cells = <1>; + #size-cells = <0>; + port0@0 { reg = <0>; label = "lan1"; -- cgit v1.2.3 From a6211caa634da39d861a47437ffcda8b38ef421b Mon Sep 17 00:00:00 2001 From: Yuqi Jin Date: Sat, 16 May 2020 11:46:49 +0800 Subject: net: revert "net: get rid of an signed integer overflow in ip_idents_reserve()" Commit adb03115f459 ("net: get rid of an signed integer overflow in ip_idents_reserve()") used atomic_cmpxchg to replace "atomic_add_return" inside the function "ip_idents_reserve". The reason was to avoid UBSAN warning. However, this change has caused performance degrade and in GCC-8, fno-strict-overflow is now mapped to -fwrapv -fwrapv-pointer and signed integer overflow is now undefined by default at all optimization levels[1]. Moreover, it was a bug in UBSAN vs -fwrapv /-fno-strict-overflow, so Let's revert it safely. [1] https://gcc.gnu.org/gcc-8/changes.html Suggested-by: Peter Zijlstra Suggested-by: Eric Dumazet Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: Hideaki YOSHIFUJI Cc: Jakub Kicinski Cc: Jiri Pirko Cc: Arvind Sankar Cc: Peter Zijlstra Cc: Eric Dumazet Cc: Jiong Wang Signed-off-by: Yuqi Jin Signed-off-by: Shaokun Zhang Signed-off-by: David S. Miller --- net/ipv4/route.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index fa829f31a3f5..b73f540fa19b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -491,18 +491,16 @@ u32 ip_idents_reserve(u32 hash, int segs) atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ; u32 old = READ_ONCE(*p_tstamp); u32 now = (u32)jiffies; - u32 new, delta = 0; + u32 delta = 0; if (old != now && cmpxchg(p_tstamp, old, now) == old) delta = prandom_u32_max(now - old); - /* Do not use atomic_add_return() as it makes UBSAN unhappy */ - do { - old = (u32)atomic_read(p_id); - new = old + delta + segs; - } while (atomic_cmpxchg(p_id, old, new) != old); - - return new - segs; + /* If UBSAN reports an error there, please make sure your compiler + * supports -fno-strict-overflow before reporting it that was a bug + * in UBSAN, and it has been fixed in GCC-8. 
+ */ + return atomic_add_return(segs + delta, p_id) - segs; } EXPORT_SYMBOL(ip_idents_reserve); -- cgit v1.2.3 From e3f2d5579c0b8ad9d1fb6a5813cee38a86386e05 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 17 May 2020 14:53:40 +0300 Subject: net: phy: propagate an error back to the callers of phy_sfp_probe The compilation warning below reveals that the errors returned from the sfp_bus_add_upstream() call are not propagated to the callers. Fix it by returning "ret". 14:37:51 drivers/net/phy/phy_device.c: In function 'phy_sfp_probe': 14:37:51 drivers/net/phy/phy_device.c:1236:6: warning: variable 'ret' set but not used [-Wunused-but-set-variable] 14:37:51 1236 | int ret; 14:37:51 | ^~~ Fixes: 298e54fa810e ("net: phy: add core phylib sfp support") Signed-off-by: Leon Romanovsky Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index ac2784192472..697c74deb222 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1233,7 +1233,7 @@ int phy_sfp_probe(struct phy_device *phydev, const struct sfp_upstream_ops *ops) { struct sfp_bus *bus; - int ret; + int ret = 0; if (phydev->mdio.dev.fwnode) { bus = sfp_bus_find_fwnode(phydev->mdio.dev.fwnode); @@ -1245,7 +1245,7 @@ int phy_sfp_probe(struct phy_device *phydev, ret = sfp_bus_add_upstream(bus, phydev, ops); sfp_bus_put(bus); } - return 0; + return ret; } EXPORT_SYMBOL(phy_sfp_probe); -- cgit v1.2.3 From b15e62631c5f19fea9895f7632dae9c1b27fe0cd Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Sun, 17 May 2020 08:46:31 -0400 Subject: net sched: fix reporting the first-time use timestamp When a new action is installed, firstuse field of 'tcf_t' is explicitly set to 0. Value of zero means "new action, not yet used"; as a packet hits the action, 'firstuse' is stamped with the current jiffies value. tcf_tm_dump() should return 0 for firstuse if action has not yet been hit. Fixes: 48d8ee1694dd ("net sched actions: aggregate dumping of actions timeinfo") Cc: Jamal Hadi Salim Signed-off-by: Roman Mashak Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/act_api.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/net/act_api.h b/include/net/act_api.h index c24d7643548e..124bd139886c 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -75,7 +75,8 @@ static inline void tcf_tm_dump(struct tcf_t *dtm, const struct tcf_t *stm) { dtm->install = jiffies_to_clock_t(jiffies - stm->install); dtm->lastuse = jiffies_to_clock_t(jiffies - stm->lastuse); - dtm->firstuse = jiffies_to_clock_t(jiffies - stm->firstuse); + dtm->firstuse = stm->firstuse ? + jiffies_to_clock_t(jiffies - stm->firstuse) : 0; dtm->expires = jiffies_to_clock_t(stm->expires); } -- cgit v1.2.3 From ef01cee2ee1b369c57a936166483d40942bcc3e3 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Tue, 19 May 2020 09:05:58 +0800 Subject: net: bmac: Fix read of MAC address from ROM In bmac_get_station_address, We're reading two bytes at a time from ROM, but we do that six times, resulting in 12 bytes of read & writes. This means we will write off the end of the six-byte destination buffer. This change fixes the for-loop to only read/write six bytes. Based on a proposed fix from Finn Thain . Signed-off-by: Jeremy Kerr Reported-by: Stan Johnson Tested-by: Stan Johnson Reported-by: Finn Thain Signed-off-by: David S. 
Miller --- drivers/net/ethernet/apple/bmac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/apple/bmac.c b/drivers/net/ethernet/apple/bmac.c index a58185b1d8bf..3e3711b60d01 100644 --- a/drivers/net/ethernet/apple/bmac.c +++ b/drivers/net/ethernet/apple/bmac.c @@ -1182,7 +1182,7 @@ bmac_get_station_address(struct net_device *dev, unsigned char *ea) int i; unsigned short data; - for (i = 0; i < 6; i++) + for (i = 0; i < 3; i++) { reset_and_select_srom(dev); data = read_srom(dev, i + EnetAddressOffset/2, SROMAddressBits); -- cgit v1.2.3 From 12555a2d97e5784eeb105ca9b1b533d4c95f1115 Mon Sep 17 00:00:00 2001 From: Todd Malsbary Date: Tue, 19 May 2020 09:45:34 -0700 Subject: mptcp: use rightmost 64 bits in ADD_ADDR HMAC This changes the HMAC used in the ADD_ADDR option from the leftmost 64 bits to the rightmost 64 bits as described in RFC 8684, section 3.4.1. This issue was discovered while adding support to packetdrill for the ADD_ADDR v1 option. Fixes: 3df523ab582c ("mptcp: Add ADD_ADDR handling") Signed-off-by: Todd Malsbary Acked-by: Christoph Paasch Reviewed-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/options.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 45497af23906..b88fae233a62 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -545,7 +545,7 @@ static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id, mptcp_crypto_hmac_sha(key1, key2, msg, 7, hmac); - return get_unaligned_be64(hmac); + return get_unaligned_be64(&hmac[MPTCP_ADDR_HMAC_LEN - sizeof(u64)]); } #if IS_ENABLED(CONFIG_MPTCP_IPV6) @@ -562,7 +562,7 @@ static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id, mptcp_crypto_hmac_sha(key1, key2, msg, 19, hmac); - return get_unaligned_be64(hmac); + return get_unaligned_be64(&hmac[MPTCP_ADDR_HMAC_LEN - sizeof(u64)]); } #endif -- cgit v1.2.3 From c27a204383616efba5a4194075e90819961ff66a Mon Sep 17 00:00:00 2001 From: Marc Payne Date: Tue, 19 May 2020 19:01:46 +0100 Subject: r8152: support additional Microsoft Surface Ethernet Adapter variant Device id 0927 is the RTL8153B-based component of the 'Surface USB-C to Ethernet and USB Adapter' and may be used as a component of other devices in future. Tested and working with the r8152 driver. Update the cdc_ether blacklist due to the RTL8153 'network jam on suspend' issue which this device will cause (personally confirmed). Signed-off-by: Marc Payne Signed-off-by: David S. 
Miller --- drivers/net/usb/cdc_ether.c | 11 +++++++++-- drivers/net/usb/r8152.c | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c index 0cdb2ce47645..a657943c9f01 100644 --- a/drivers/net/usb/cdc_ether.c +++ b/drivers/net/usb/cdc_ether.c @@ -815,14 +815,21 @@ static const struct usb_device_id products[] = { .driver_info = 0, }, -/* Microsoft Surface 3 dock (based on Realtek RTL8153) */ +/* Microsoft Surface Ethernet Adapter (based on Realtek RTL8153) */ { USB_DEVICE_AND_INTERFACE_INFO(MICROSOFT_VENDOR_ID, 0x07c6, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, - /* TP-LINK UE300 USB 3.0 Ethernet Adapters (based on Realtek RTL8153) */ +/* Microsoft Surface Ethernet Adapter (based on Realtek RTL8153B) */ +{ + USB_DEVICE_AND_INTERFACE_INFO(MICROSOFT_VENDOR_ID, 0x0927, USB_CLASS_COMM, + USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), + .driver_info = 0, +}, + +/* TP-LINK UE300 USB 3.0 Ethernet Adapters (based on Realtek RTL8153) */ { USB_DEVICE_AND_INTERFACE_INFO(TPLINK_VENDOR_ID, 0x0601, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 8f8d9883d363..c8c873a613b6 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -6880,6 +6880,7 @@ static const struct usb_device_id rtl8152_table[] = { {REALTEK_USB_DEVICE(VENDOR_ID_REALTEK, 0x8153)}, {REALTEK_USB_DEVICE(VENDOR_ID_MICROSOFT, 0x07ab)}, {REALTEK_USB_DEVICE(VENDOR_ID_MICROSOFT, 0x07c6)}, + {REALTEK_USB_DEVICE(VENDOR_ID_MICROSOFT, 0x0927)}, {REALTEK_USB_DEVICE(VENDOR_ID_SAMSUNG, 0xa101)}, {REALTEK_USB_DEVICE(VENDOR_ID_LENOVO, 0x304f)}, {REALTEK_USB_DEVICE(VENDOR_ID_LENOVO, 0x3062)}, -- cgit v1.2.3 From 88d7fcfa3b1fe670f0412b95be785aafca63352b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 18 May 2020 17:13:34 -0700 Subject: net: inet_csk: Fix so_reuseport bind-address cache in tb->fast* The commit 637bc8bbe6c0 ("inet: reset tb->fastreuseport when adding a reuseport sk") added a bind-address cache in tb->fast*. The tb->fast* caches the address of a sk which has successfully been binded with SO_REUSEPORT ON. The idea is to avoid the expensive conflict search in inet_csk_bind_conflict(). There is an issue with wildcard matching where sk_reuseport_match() should have returned false but it is currently returning true. It ends up hiding bind conflict. For example, bind("[::1]:443"); /* without SO_REUSEPORT. Succeed. */ bind("[::2]:443"); /* with SO_REUSEPORT. Succeed. */ bind("[::]:443"); /* with SO_REUSEPORT. Still Succeed where it shouldn't */ The last bind("[::]:443") with SO_REUSEPORT on should have failed because it should have a conflict with the very first bind("[::1]:443") which has SO_REUSEPORT off. However, the address "[::2]" is cached in tb->fast* in the second bind. In the last bind, the sk_reuseport_match() returns true because the binding sk's wildcard addr "[::]" matches with the "[::2]" cached in tb->fast*. The correct bind conflict is reported by removing the second bind such that tb->fast* cache is not involved and forces the bind("[::]:443") to go through the inet_csk_bind_conflict(): bind("[::1]:443"); /* without SO_REUSEPORT. Succeed. */ bind("[::]:443"); /* with SO_REUSEPORT. -EADDRINUSE */ The expected behavior for sk_reuseport_match() is, it should only allow the "cached" tb->fast* address to be used as a wildcard match but not the address of the binding sk. 
To do that, the current "bool match_wildcard" arg is split into "bool match_sk1_wildcard" and "bool match_sk2_wildcard". This change only affects the sk_reuseport_match() which is only used by inet_csk (e.g. TCP). The other use cases are calling inet_rcv_saddr_equal() and this patch makes it pass the same "match_wildcard" arg twice to the "ipv[46]_rcv_saddr_equal(..., match_wildcard, match_wildcard)". Cc: Josef Bacik Fixes: 637bc8bbe6c0 ("inet: reset tb->fastreuseport when adding a reuseport sk") Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 43 +++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 5f34eb951627..65c29f2bd89f 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -24,17 +24,19 @@ #include #if IS_ENABLED(CONFIG_IPV6) -/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6 - * only, and any IPv4 addresses if not IPv6 only - * match_wildcard == false: addresses must be exactly the same, i.e. - * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, - * and 0.0.0.0 equals to 0.0.0.0 only +/* match_sk*_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses + * if IPv6 only, and any IPv4 addresses + * if not IPv6 only + * match_sk*_wildcard == false: addresses must be exactly the same, i.e. + * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, + * and 0.0.0.0 equals to 0.0.0.0 only */ static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, const struct in6_addr *sk2_rcv_saddr6, __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, bool sk1_ipv6only, bool sk2_ipv6only, - bool match_wildcard) + bool match_sk1_wildcard, + bool match_sk2_wildcard) { int addr_type = ipv6_addr_type(sk1_rcv_saddr6); int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; @@ -44,8 +46,8 @@ static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, if (!sk2_ipv6only) { if (sk1_rcv_saddr == sk2_rcv_saddr) return true; - if (!sk1_rcv_saddr || !sk2_rcv_saddr) - return match_wildcard; + return (match_sk1_wildcard && !sk1_rcv_saddr) || + (match_sk2_wildcard && !sk2_rcv_saddr); } return false; } @@ -53,11 +55,11 @@ static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY) return true; - if (addr_type2 == IPV6_ADDR_ANY && match_wildcard && + if (addr_type2 == IPV6_ADDR_ANY && match_sk2_wildcard && !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) return true; - if (addr_type == IPV6_ADDR_ANY && match_wildcard && + if (addr_type == IPV6_ADDR_ANY && match_sk1_wildcard && !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) return true; @@ -69,18 +71,19 @@ static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, } #endif -/* match_wildcard == true: 0.0.0.0 equals to any IPv4 addresses - * match_wildcard == false: addresses must be exactly the same, i.e. - * 0.0.0.0 only equals to 0.0.0.0 +/* match_sk*_wildcard == true: 0.0.0.0 equals to any IPv4 addresses + * match_sk*_wildcard == false: addresses must be exactly the same, i.e. 
+ * 0.0.0.0 only equals to 0.0.0.0 */ static bool ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, - bool sk2_ipv6only, bool match_wildcard) + bool sk2_ipv6only, bool match_sk1_wildcard, + bool match_sk2_wildcard) { if (!sk2_ipv6only) { if (sk1_rcv_saddr == sk2_rcv_saddr) return true; - if (!sk1_rcv_saddr || !sk2_rcv_saddr) - return match_wildcard; + return (match_sk1_wildcard && !sk1_rcv_saddr) || + (match_sk2_wildcard && !sk2_rcv_saddr); } return false; } @@ -96,10 +99,12 @@ bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, sk2->sk_rcv_saddr, ipv6_only_sock(sk), ipv6_only_sock(sk2), + match_wildcard, match_wildcard); #endif return ipv4_rcv_saddr_equal(sk->sk_rcv_saddr, sk2->sk_rcv_saddr, - ipv6_only_sock(sk2), match_wildcard); + ipv6_only_sock(sk2), match_wildcard, + match_wildcard); } EXPORT_SYMBOL(inet_rcv_saddr_equal); @@ -285,10 +290,10 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb, tb->fast_rcv_saddr, sk->sk_rcv_saddr, tb->fast_ipv6_only, - ipv6_only_sock(sk), true); + ipv6_only_sock(sk), true, false); #endif return ipv4_rcv_saddr_equal(tb->fast_rcv_saddr, sk->sk_rcv_saddr, - ipv6_only_sock(sk), true); + ipv6_only_sock(sk), true, false); } /* Obtain a reference to a local port for the given sock, -- cgit v1.2.3 From c0bbbdc32febd4f034ecbf3ea17865785b2c0652 Mon Sep 17 00:00:00 2001 From: Boris Sukholitko Date: Tue, 19 May 2020 10:32:37 +0300 Subject: __netif_receive_skb_core: pass skb by reference __netif_receive_skb_core may change the skb pointer passed into it (e.g. in rx_handler). The original skb may be freed as a result of this operation. The callers of __netif_receive_skb_core may further process original skb by using pt_prev pointer returned by __netif_receive_skb_core thus leading to unpleasant effects. The solution is to pass skb by reference into __netif_receive_skb_core. v2: Added Fixes tag and comment regarding ppt_prev and skb invariant. Fixes: 88eb1944e18c ("net: core: propagate SKB lists through packet_type lookup") Signed-off-by: Boris Sukholitko Acked-by: Edward Cree Signed-off-by: David S. Miller --- net/core/dev.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 6d327b7aa813..2d8aceee4284 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4988,11 +4988,12 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, return 0; } -static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc, +static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, struct packet_type **ppt_prev) { struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; + struct sk_buff *skb = *pskb; struct net_device *orig_dev; bool deliver_exact = false; int ret = NET_RX_DROP; @@ -5023,8 +5024,10 @@ another_round: ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); preempt_enable(); - if (ret2 != XDP_PASS) - return NET_RX_DROP; + if (ret2 != XDP_PASS) { + ret = NET_RX_DROP; + goto out; + } skb_reset_mac_len(skb); } @@ -5174,6 +5177,13 @@ drop: } out: + /* The invariant here is that if *ppt_prev is not NULL + * then skb should also be non-NULL. + * + * Apparently *ppt_prev assignment above holds this invariant due to + * skb dereferencing near it. 
+ */ + *pskb = skb; return ret; } @@ -5183,7 +5193,7 @@ static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc) struct packet_type *pt_prev = NULL; int ret; - ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev); + ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); if (pt_prev) ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb, skb->dev, pt_prev, orig_dev); @@ -5261,7 +5271,7 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo struct packet_type *pt_prev = NULL; skb_list_del_init(skb); - __netif_receive_skb_core(skb, pfmemalloc, &pt_prev); + __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); if (!pt_prev) continue; if (pt_curr != pt_prev || od_curr != orig_dev) { -- cgit v1.2.3 From 20a785aa52c82246055a089e55df9dac47d67da1 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 19 May 2020 16:04:05 -0400 Subject: sctp: Don't add the shutdown timer if its already been added This BUG halt was reported a while back, but the patch somehow got missed: PID: 2879 TASK: c16adaa0 CPU: 1 COMMAND: "sctpn" #0 [f418dd28] crash_kexec at c04a7d8c #1 [f418dd7c] oops_end at c0863e02 #2 [f418dd90] do_invalid_op at c040aaca #3 [f418de28] error_code (via invalid_op) at c08631a5 EAX: f34baac0 EBX: 00000090 ECX: f418deb0 EDX: f5542950 EBP: 00000000 DS: 007b ESI: f34ba800 ES: 007b EDI: f418dea0 GS: 00e0 CS: 0060 EIP: c046fa5e ERR: ffffffff EFLAGS: 00010286 #4 [f418de5c] add_timer at c046fa5e #5 [f418de68] sctp_do_sm at f8db8c77 [sctp] #6 [f418df30] sctp_primitive_SHUTDOWN at f8dcc1b5 [sctp] #7 [f418df48] inet_shutdown at c080baf9 #8 [f418df5c] sys_shutdown at c079eedf #9 [f418df70] sys_socketcall at c079fe88 EAX: ffffffda EBX: 0000000d ECX: bfceea90 EDX: 0937af98 DS: 007b ESI: 0000000c ES: 007b EDI: b7150ae4 SS: 007b ESP: bfceea7c EBP: bfceeaa8 GS: 0033 CS: 0073 EIP: b775c424 ERR: 00000066 EFLAGS: 00000282 It appears that the side effect that starts the shutdown timer was processed multiple times, which can happen as multiple paths can trigger it. This of course leads to the BUG halt in add_timer getting called. Fix seems pretty straightforward, just check before the timer is added if its already been started. If it has mod the timer instead to min(current expiration, new expiration) Its been tested but not confirmed to fix the problem, as the issue has only occured in production environments where test kernels are enjoined from being installed. It appears to be a sane fix to me though. Also, recentely, Jere found a reproducer posted on list to confirm that this resolves the issues Signed-off-by: Neil Horman CC: Vlad Yasevich CC: "David S. Miller" CC: jere.leppanen@nokia.com CC: marcelo.leitner@gmail.com CC: netdev@vger.kernel.org Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/sm_sideeffect.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 2bc29463e1dc..9f36fe911d08 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1523,9 +1523,17 @@ static int sctp_cmd_interpreter(enum sctp_event_type event_type, timeout = asoc->timeouts[cmd->obj.to]; BUG_ON(!timeout); - timer->expires = jiffies + timeout; - sctp_association_hold(asoc); - add_timer(timer); + /* + * SCTP has a hard time with timer starts. Because we process + * timer starts as side effects, it can be hard to tell if we + * have already started a timer or not, which leads to BUG + * halts when we call add_timer. 
So here, instead of just starting + * a timer, if the timer is already started, and just mod + * the timer with the shorter of the two expiration times + */ + if (!timer_pending(timer)) + sctp_association_hold(asoc); + timer_reduce(timer, jiffies + timeout); break; case SCTP_CMD_TIMER_RESTART: -- cgit v1.2.3 From d1f129470e6cb79b8b97fecd12689f6eb49e27fe Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 28 Apr 2020 22:06:54 +0100 Subject: rxrpc: Trace discarded ACKs Add a tracepoint to track received ACKs that are discarded due to being outside of the Tx window. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 35 +++++++++++++++++++++++++++++++++++ net/rxrpc/input.c | 12 ++++++++++-- 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index ab75f261f04a..ba9efdc848f9 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -1541,6 +1541,41 @@ TRACE_EVENT(rxrpc_notify_socket, __entry->serial) ); +TRACE_EVENT(rxrpc_rx_discard_ack, + TP_PROTO(unsigned int debug_id, rxrpc_serial_t serial, + rxrpc_seq_t first_soft_ack, rxrpc_seq_t call_ackr_first, + rxrpc_seq_t prev_pkt, rxrpc_seq_t call_ackr_prev), + + TP_ARGS(debug_id, serial, first_soft_ack, call_ackr_first, + prev_pkt, call_ackr_prev), + + TP_STRUCT__entry( + __field(unsigned int, debug_id ) + __field(rxrpc_serial_t, serial ) + __field(rxrpc_seq_t, first_soft_ack) + __field(rxrpc_seq_t, call_ackr_first) + __field(rxrpc_seq_t, prev_pkt) + __field(rxrpc_seq_t, call_ackr_prev) + ), + + TP_fast_assign( + __entry->debug_id = debug_id; + __entry->serial = serial; + __entry->first_soft_ack = first_soft_ack; + __entry->call_ackr_first = call_ackr_first; + __entry->prev_pkt = prev_pkt; + __entry->call_ackr_prev = call_ackr_prev; + ), + + TP_printk("c=%08x r=%08x %08x<%08x %08x<%08x", + __entry->debug_id, + __entry->serial, + __entry->first_soft_ack, + __entry->call_ackr_first, + __entry->prev_pkt, + __entry->call_ackr_prev) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index e438bfd3fdf5..2f22f082a66c 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -866,8 +866,12 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) /* Discard any out-of-order or duplicate ACKs (outside lock). */ if (before(first_soft_ack, call->ackr_first_seq) || - before(prev_pkt, call->ackr_prev_seq)) + before(prev_pkt, call->ackr_prev_seq)) { + trace_rxrpc_rx_discard_ack(call->debug_id, sp->hdr.serial, + first_soft_ack, call->ackr_first_seq, + prev_pkt, call->ackr_prev_seq); return; + } buf.info.rxMTU = 0; ioffset = offset + nr_acks + 3; @@ -879,8 +883,12 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) /* Discard any out-of-order or duplicate ACKs (inside lock). */ if (before(first_soft_ack, call->ackr_first_seq) || - before(prev_pkt, call->ackr_prev_seq)) + before(prev_pkt, call->ackr_prev_seq)) { + trace_rxrpc_rx_discard_ack(call->debug_id, sp->hdr.serial, + first_soft_ack, call->ackr_first_seq, + prev_pkt, call->ackr_prev_seq); goto out; + } call->acks_latest_ts = skb->tstamp; call->ackr_first_seq = first_soft_ack; -- cgit v1.2.3 From 441fdee1eaf050ef0040bde0d7af075c1c6a6d8b Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 29 Apr 2020 23:48:43 +0100 Subject: rxrpc: Fix ack discard The Rx protocol has a "previousPacket" field in it that is not handled in the same way by all protocol implementations. 
Sometimes it contains the serial number of the last DATA packet received, sometimes the sequence number of the last DATA packet received and sometimes the highest sequence number so far received. AF_RXRPC is using this to weed out ACKs that are out of date (it's possible for ACK packets to get reordered on the wire), but this does not work with OpenAFS which will just stick the sequence number of the last packet seen into previousPacket. The issue being seen is that big AFS FS.StoreData RPC (eg. of ~256MiB) are timing out when partly sent. A trace was captured, with an additional tracepoint to show ACKs being discarded in rxrpc_input_ack(). Here's an excerpt showing the problem. 52873.203230: rxrpc_tx_data: c=000004ae DATA ed1a3584:00000002 0002449c q=00024499 fl=09 A DATA packet with sequence number 00024499 has been transmitted (the "q=" field). ... 52873.243296: rxrpc_rx_ack: c=000004ae 00012a2b DLY r=00024499 f=00024497 p=00024496 n=0 52873.243376: rxrpc_rx_ack: c=000004ae 00012a2c IDL r=0002449b f=00024499 p=00024498 n=0 52873.243383: rxrpc_rx_ack: c=000004ae 00012a2d OOS r=0002449d f=00024499 p=0002449a n=2 The Out-Of-Sequence ACK indicates that the server didn't see DATA sequence number 00024499, but did see seq 0002449a (previousPacket, shown as "p=", skipped the number, but firstPacket, "f=", which shows the bottom of the window is set at that point). 52873.252663: rxrpc_retransmit: c=000004ae q=24499 a=02 xp=14581537 52873.252664: rxrpc_tx_data: c=000004ae DATA ed1a3584:00000002 000244bc q=00024499 fl=0b *RETRANS* The packet has been retransmitted. Retransmission recurs until the peer says it got the packet. 52873.271013: rxrpc_rx_ack: c=000004ae 00012a31 OOS r=000244a1 f=00024499 p=0002449e n=6 More OOS ACKs indicate that the other packets that are already in the transmission pipeline are being received. The specific-ACK list is up to 6 ACKs and NAKs. ... 52873.284792: rxrpc_rx_ack: c=000004ae 00012a49 OOS r=000244b9 f=00024499 p=000244b6 n=30 52873.284802: rxrpc_retransmit: c=000004ae q=24499 a=0a xp=63505500 52873.284804: rxrpc_tx_data: c=000004ae DATA ed1a3584:00000002 000244c2 q=00024499 fl=0b *RETRANS* 52873.287468: rxrpc_rx_ack: c=000004ae 00012a4a OOS r=000244ba f=00024499 p=000244b7 n=31 52873.287478: rxrpc_rx_ack: c=000004ae 00012a4b OOS r=000244bb f=00024499 p=000244b8 n=32 At this point, the server's receive window is full (n=32) with presumably 1 NAK'd packet and 31 ACK'd packets. We can't transmit any more packets. 52873.287488: rxrpc_retransmit: c=000004ae q=24499 a=0a xp=61327980 52873.287489: rxrpc_tx_data: c=000004ae DATA ed1a3584:00000002 000244c3 q=00024499 fl=0b *RETRANS* 52873.293850: rxrpc_rx_ack: c=000004ae 00012a4c DLY r=000244bc f=000244a0 p=00024499 n=25 And now we've received an ACK indicating that a DATA retransmission was received. 7 packets have been processed (the occupied part of the window moved, as indicated by f= and n=). 52873.293853: rxrpc_rx_discard_ack: c=000004ae r=00012a4c 000244a0<00024499 00024499<000244b8 However, the DLY ACK gets discarded because its previousPacket has gone backwards (from p=000244b8, in the ACK at 52873.287478 to p=00024499 in the ACK at 52873.293850). We then end up in a continuous cycle of retransmit/discard. kafs fails to update its window because it's discarding the ACKs and can't transmit an extra packet that would clear the issue because the window is full. OpenAFS doesn't change the previousPacket value in the ACKs because no new DATA packets are received with a different previousPacket number. 
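To see the numbers concretely, here is a small standalone sketch (illustrative only, not part of the patch) that re-evaluates the discarded DLY ACK above under the pre-fix check; the wrapping sequence-number helpers are reimplemented here on the assumption that they behave like rxrpc's before()/after():

#include <stdio.h>
#include <stdint.h>

typedef uint32_t rxrpc_seq_t;

/* Assumed equivalents of rxrpc's wrapping u32 comparisons. */
static int seq_before(rxrpc_seq_t a, rxrpc_seq_t b) { return (int32_t)(a - b) < 0; }
static int seq_after(rxrpc_seq_t a, rxrpc_seq_t b) { return (int32_t)(a - b) > 0; }

int main(void)
{
	rxrpc_seq_t first_soft_ack = 0x000244a0;	/* f= in the DLY ACK at 52873.293850 */
	rxrpc_seq_t prev_pkt       = 0x00024499;	/* p= in the same ACK */
	rxrpc_seq_t ackr_first_seq = 0x00024499;	/* state left behind by the earlier OOS ACKs */
	rxrpc_seq_t ackr_prev_seq  = 0x000244b8;

	/* Pre-fix check: discard if either field appears to regress. */
	printf("old check discards: %d\n",
	       seq_before(first_soft_ack, ackr_first_seq) ||
	       seq_before(prev_pkt, ackr_prev_seq));		/* prints 1 */

	/* Yet the bottom of the window (firstPacket) really did advance. */
	printf("firstPacket advanced: %d\n",
	       seq_after(first_soft_ack, ackr_first_seq));	/* prints 1 */
	return 0;
}

So the old check throws the ACK away purely because previousPacket went backwards, even though firstPacket moved forward.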
Fix this by altering the discard check to only discard an ACK based on previousPacket if there was no advance in the firstPacket. This allows us to transmit a new packet which will cause previousPacket to advance in the next ACK. The check, however, needs to allow for the possibility that previousPacket may actually have had the serial number placed in it instead - in which case it will go outside the window and we should ignore it. Fixes: 1a2391c30c0b ("rxrpc: Fix detection of out of order acks") Reported-by: Dave Botsch Signed-off-by: David Howells --- net/rxrpc/input.c | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 2f22f082a66c..3be4177baf70 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -802,6 +802,30 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, } } +/* + * Return true if the ACK is valid - ie. it doesn't appear to have regressed + * with respect to the ack state conveyed by preceding ACKs. + */ +static bool rxrpc_is_ack_valid(struct rxrpc_call *call, + rxrpc_seq_t first_pkt, rxrpc_seq_t prev_pkt) +{ + rxrpc_seq_t base = READ_ONCE(call->ackr_first_seq); + + if (after(first_pkt, base)) + return true; /* The window advanced */ + + if (before(first_pkt, base)) + return false; /* firstPacket regressed */ + + if (after_eq(prev_pkt, call->ackr_prev_seq)) + return true; /* previousPacket hasn't regressed. */ + + /* Some rx implementations put a serial number in previousPacket. */ + if (after_eq(prev_pkt, base + call->tx_winsize)) + return false; + return true; +} + /* * Process an ACK packet. * @@ -865,8 +889,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) } /* Discard any out-of-order or duplicate ACKs (outside lock). */ - if (before(first_soft_ack, call->ackr_first_seq) || - before(prev_pkt, call->ackr_prev_seq)) { + if (!rxrpc_is_ack_valid(call, first_soft_ack, prev_pkt)) { trace_rxrpc_rx_discard_ack(call->debug_id, sp->hdr.serial, first_soft_ack, call->ackr_first_seq, prev_pkt, call->ackr_prev_seq); @@ -882,8 +905,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) spin_lock(&call->input_lock); /* Discard any out-of-order or duplicate ACKs (inside lock). */ - if (before(first_soft_ack, call->ackr_first_seq) || - before(prev_pkt, call->ackr_prev_seq)) { + if (!rxrpc_is_ack_valid(call, first_soft_ack, prev_pkt)) { trace_rxrpc_rx_discard_ack(call->debug_id, sp->hdr.serial, first_soft_ack, call->ackr_first_seq, prev_pkt, call->ackr_prev_seq); -- cgit v1.2.3 From 0550cfe8c2c6f8e7a4c348b6603a794576db12dd Mon Sep 17 00:00:00 2001 From: KP Singh Date: Wed, 20 May 2020 14:56:16 +0200 Subject: security: Fix hook iteration for secid_to_secctx secid_to_secctx is not stackable, and since the BPF LSM registers this hook by default, the call_int_hook logic is not suitable which "bails-on-fail" and casues issues when other LSMs register this hook and eventually breaks Audit. 
In order to fix this, directly iterate over the security hooks instead of using call_int_hook as suggested in: https://lore.kernel.org/bpf/9d0eb6c6-803a-ff3a-5603-9ad6d9edfc00@schaufler-ca.com/#t Fixes: 98e828a0650f ("security: Refactor declaration of LSM hooks") Fixes: 625236ba3832 ("security: Fix the default value of secid_to_secctx hook") Reported-by: Alexei Starovoitov Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: James Morris Link: https://lore.kernel.org/bpf/20200520125616.193765-1-kpsingh@chromium.org --- security/security.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/security/security.c b/security/security.c index 7fed24b9d57e..51de970fbb1e 100644 --- a/security/security.c +++ b/security/security.c @@ -1965,8 +1965,20 @@ EXPORT_SYMBOL(security_ismaclabel); int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) { - return call_int_hook(secid_to_secctx, -EOPNOTSUPP, secid, secdata, - seclen); + struct security_hook_list *hp; + int rc; + + /* + * Currently, only one LSM can implement secid_to_secctx (i.e this + * LSM hook is not "stackable"). + */ + hlist_for_each_entry(hp, &security_hook_heads.secid_to_secctx, list) { + rc = hp->hook.secid_to_secctx(secid, secdata, seclen); + if (rc != LSM_RET_DEFAULT(secid_to_secctx)) + return rc; + } + + return LSM_RET_DEFAULT(secid_to_secctx); } EXPORT_SYMBOL(security_secid_to_secctx); -- cgit v1.2.3 From dfeb376dd4cb2c5004aeb625e2475f58a5ff2ea7 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 18 May 2020 22:38:24 -0700 Subject: bpf: Prevent mmap()'ing read-only maps as writable As discussed in [0], it's dangerous to allow mapping a BPF map that's meant to be frozen and is read-only on the BPF program side, because that allows user-space to retain a writable view of the page even after it is frozen. This is exacerbated by the BPF verifier making a strong assumption that contents of such frozen map will remain unchanged. To prevent this, disallow mapping BPF_F_RDONLY_PROG mmap()'able BPF maps as writable, ever.
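As a rough illustration of the resulting behaviour, the hedged userspace sketch below (not taken from the patch or its selftest; it assumes a uapi linux/bpf.h that defines BPF_F_MMAPABLE and BPF_F_RDONLY_PROG, and a kernel carrying this change) creates such a map and checks both mapping modes; with the fix, the writable mmap() is expected to fail with EACCES while the read-only one still succeeds:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/bpf.h>

int main(void)
{
	union bpf_attr attr;
	int fd;

	/* Program-side read-only, mmap()'able array map; error handling trimmed. */
	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_ARRAY;
	attr.key_size    = 4;
	attr.value_size  = 4096;
	attr.max_entries = 1;
	attr.map_flags   = BPF_F_MMAPABLE | BPF_F_RDONLY_PROG;
	fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));

	void *rw = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	printf("PROT_WRITE mmap: %s\n",
	       rw == MAP_FAILED ? strerror(errno) : "unexpectedly succeeded");

	void *ro = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
	printf("PROT_READ mmap: %s\n",
	       ro == MAP_FAILED ? strerror(errno) : "ok");
	return 0;
}

The selftest added below exercises the same two cases against the skeleton's rdonly_map.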
[0] https://lore.kernel.org/bpf/CAEf4BzYGWYhXdp6BJ7_=9OQPJxQpgug080MMjdSB72i9R+5c6g@mail.gmail.com/ Fixes: fc9702273e2e ("bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY") Suggested-by: Jann Horn Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Reviewed-by: Jann Horn Link: https://lore.kernel.org/bpf/20200519053824.1089415-1-andriin@fb.com --- kernel/bpf/syscall.c | 17 ++++++++++++++--- tools/testing/selftests/bpf/prog_tests/mmap.c | 13 ++++++++++++- tools/testing/selftests/bpf/progs/test_mmap.c | 8 ++++++++ 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2843bbba9ca1..4e6dee19a668 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -623,9 +623,20 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) mutex_lock(&map->freeze_mutex); - if ((vma->vm_flags & VM_WRITE) && map->frozen) { - err = -EPERM; - goto out; + if (vma->vm_flags & VM_WRITE) { + if (map->frozen) { + err = -EPERM; + goto out; + } + /* map is meant to be read-only, so do not allow mapping as + * writable, because it's possible to leak a writable page + * reference and allows user-space to still modify it after + * freezing, while verifier will assume contents do not change + */ + if (map->map_flags & BPF_F_RDONLY_PROG) { + err = -EACCES; + goto out; + } } /* set default open/close callbacks */ diff --git a/tools/testing/selftests/bpf/prog_tests/mmap.c b/tools/testing/selftests/bpf/prog_tests/mmap.c index 6b9dce431d41..43d0b5578f46 100644 --- a/tools/testing/selftests/bpf/prog_tests/mmap.c +++ b/tools/testing/selftests/bpf/prog_tests/mmap.c @@ -19,7 +19,7 @@ void test_mmap(void) const size_t map_sz = roundup_page(sizeof(struct map_data)); const int zero = 0, one = 1, two = 2, far = 1500; const long page_size = sysconf(_SC_PAGE_SIZE); - int err, duration = 0, i, data_map_fd, data_map_id, tmp_fd; + int err, duration = 0, i, data_map_fd, data_map_id, tmp_fd, rdmap_fd; struct bpf_map *data_map, *bss_map; void *bss_mmaped = NULL, *map_mmaped = NULL, *tmp1, *tmp2; struct test_mmap__bss *bss_data; @@ -37,6 +37,17 @@ void test_mmap(void) data_map = skel->maps.data_map; data_map_fd = bpf_map__fd(data_map); + rdmap_fd = bpf_map__fd(skel->maps.rdonly_map); + tmp1 = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, rdmap_fd, 0); + if (CHECK(tmp1 != MAP_FAILED, "rdonly_write_mmap", "unexpected success\n")) { + munmap(tmp1, 4096); + goto cleanup; + } + /* now double-check if it's mmap()'able at all */ + tmp1 = mmap(NULL, 4096, PROT_READ, MAP_SHARED, rdmap_fd, 0); + if (CHECK(tmp1 == MAP_FAILED, "rdonly_read_mmap", "failed: %d\n", errno)) + goto cleanup; + /* get map's ID */ memset(&map_info, 0, map_info_sz); err = bpf_obj_get_info_by_fd(data_map_fd, &map_info, &map_info_sz); diff --git a/tools/testing/selftests/bpf/progs/test_mmap.c b/tools/testing/selftests/bpf/progs/test_mmap.c index 6239596cd14e..4eb42cff5fe9 100644 --- a/tools/testing/selftests/bpf/progs/test_mmap.c +++ b/tools/testing/selftests/bpf/progs/test_mmap.c @@ -7,6 +7,14 @@ char _license[] SEC("license") = "GPL"; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 4096); + __uint(map_flags, BPF_F_MMAPABLE | BPF_F_RDONLY_PROG); + __type(key, __u32); + __type(value, char); +} rdonly_map SEC(".maps"); + struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 512 * 4); /* at least 4 pages of data */ -- cgit v1.2.3 From ee3c1aa3f34b7842c1557cfe5d8c3f7b8c692de8 Mon Sep 17 00:00:00 2001 From: "Jason A. 
Donenfeld" Date: Tue, 19 May 2020 22:49:27 -0600 Subject: wireguard: selftests: use newer iproute2 for gcc-10 gcc-10 switched to defaulting to -fno-common, which broke iproute2-5.4. This was fixed in iproute-5.6, so switch to that. Because we're after a stable testing surface, we generally don't like to bump these unnecessarily, but in this case, being able to actually build is a basic necessity. Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- tools/testing/selftests/wireguard/qemu/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/wireguard/qemu/Makefile b/tools/testing/selftests/wireguard/qemu/Makefile index 90598a425c18..4bdd6c1a19d3 100644 --- a/tools/testing/selftests/wireguard/qemu/Makefile +++ b/tools/testing/selftests/wireguard/qemu/Makefile @@ -44,7 +44,7 @@ endef $(eval $(call tar_download,MUSL,musl,1.2.0,.tar.gz,https://musl.libc.org/releases/,c6de7b191139142d3f9a7b5b702c9cae1b5ee6e7f57e582da9328629408fd4e8)) $(eval $(call tar_download,IPERF,iperf,3.7,.tar.gz,https://downloads.es.net/pub/iperf/,d846040224317caf2f75c843d309a950a7db23f9b44b94688ccbe557d6d1710c)) $(eval $(call tar_download,BASH,bash,5.0,.tar.gz,https://ftp.gnu.org/gnu/bash/,b4a80f2ac66170b2913efbfb9f2594f1f76c7b1afd11f799e22035d63077fb4d)) -$(eval $(call tar_download,IPROUTE2,iproute2,5.4.0,.tar.xz,https://www.kernel.org/pub/linux/utils/net/iproute2/,fe97aa60a0d4c5ac830be18937e18dc3400ca713a33a89ad896ff1e3d46086ae)) +$(eval $(call tar_download,IPROUTE2,iproute2,5.6.0,.tar.xz,https://www.kernel.org/pub/linux/utils/net/iproute2/,1b5b0e25ce6e23da7526ea1da044e814ad85ba761b10dd29c2b027c056b04692)) $(eval $(call tar_download,IPTABLES,iptables,1.8.4,.tar.bz2,https://www.netfilter.org/projects/iptables/files/,993a3a5490a544c2cbf2ef15cf7e7ed21af1845baf228318d5c36ef8827e157c)) $(eval $(call tar_download,NMAP,nmap,7.80,.tar.bz2,https://nmap.org/dist/,fcfa5a0e42099e12e4bf7a68ebe6fde05553383a682e816a7ec9256ab4773faa)) $(eval $(call tar_download,IPUTILS,iputils,s20190709,.tar.gz,https://github.com/iputils/iputils/archive/s20190709.tar.gz/#,a15720dd741d7538dd2645f9f516d193636ae4300ff7dbc8bfca757bf166490a)) -- cgit v1.2.3 From bc67d371256f5c47d824e2eec51e46c8d62d022e Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 19 May 2020 22:49:28 -0600 Subject: wireguard: noise: read preshared key while taking lock Prior we read the preshared key after dropping the handshake lock, which isn't an actual crypto issue if it races, but it's still not quite correct. So copy that part of the state into a temporary like we do with the rest of the handshake state variables. Then we can release the lock, operate on the temporary, and zero it out at the end of the function. In performance tests, the impact of this was entirely unnoticable, probably because those bytes are coming from the same cacheline as other things that are being copied out in the same manner. Reported-by: Matt Dunwoodie Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. 
Miller --- drivers/net/wireguard/noise.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireguard/noise.c b/drivers/net/wireguard/noise.c index 708dc61c974f..07eb438a6dee 100644 --- a/drivers/net/wireguard/noise.c +++ b/drivers/net/wireguard/noise.c @@ -715,6 +715,7 @@ wg_noise_handshake_consume_response(struct message_handshake_response *src, u8 e[NOISE_PUBLIC_KEY_LEN]; u8 ephemeral_private[NOISE_PUBLIC_KEY_LEN]; u8 static_private[NOISE_PUBLIC_KEY_LEN]; + u8 preshared_key[NOISE_SYMMETRIC_KEY_LEN]; down_read(&wg->static_identity.lock); @@ -733,6 +734,8 @@ wg_noise_handshake_consume_response(struct message_handshake_response *src, memcpy(chaining_key, handshake->chaining_key, NOISE_HASH_LEN); memcpy(ephemeral_private, handshake->ephemeral_private, NOISE_PUBLIC_KEY_LEN); + memcpy(preshared_key, handshake->preshared_key, + NOISE_SYMMETRIC_KEY_LEN); up_read(&handshake->lock); if (state != HANDSHAKE_CREATED_INITIATION) @@ -750,7 +753,7 @@ wg_noise_handshake_consume_response(struct message_handshake_response *src, goto fail; /* psk */ - mix_psk(chaining_key, hash, key, handshake->preshared_key); + mix_psk(chaining_key, hash, key, preshared_key); /* {} */ if (!message_decrypt(NULL, src->encrypted_nothing, @@ -783,6 +786,7 @@ out: memzero_explicit(chaining_key, NOISE_HASH_LEN); memzero_explicit(ephemeral_private, NOISE_PUBLIC_KEY_LEN); memzero_explicit(static_private, NOISE_PUBLIC_KEY_LEN); + memzero_explicit(preshared_key, NOISE_SYMMETRIC_KEY_LEN); up_read(&wg->static_identity.lock); return ret_peer; } -- cgit v1.2.3 From c78a0b4a78839d572d8a80f6a62221c0d7843135 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 19 May 2020 22:49:29 -0600 Subject: wireguard: queueing: preserve flow hash across packet scrubbing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's important that we clear most header fields during encapsulation and decapsulation, because the packet is substantially changed, and we don't want any info leak or logic bug due to an accidental correlation. But, for encapsulation, it's wrong to clear skb->hash, since it's used by fq_codel and flow dissection in general. Without it, classification does not proceed as usual. This change might make it easier to estimate the number of innerflows by examining clustering of out of order packets, but this shouldn't open up anything that can't already be inferred otherwise (e.g. syn packet size inference), and fq_codel can be disabled anyway. Furthermore, it might be the case that the hash isn't used or queried at all until after wireguard transmits the encrypted UDP packet, which means skb->hash might still be zero at this point, and thus no hash taken over the inner packet data. In order to address this situation, we force a calculation of skb->hash before encrypting packet data. Of course this means that fq_codel might transmit packets slightly more out of order than usual. Toke did some testing on beefy machines with high quantities of parallel flows and found that increasing the reply-attack counter to 8192 takes care of the most pathological cases pretty well. Reported-by: Dave Taht Reviewed-and-tested-by: Toke Høiland-Jørgensen Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. 
Miller --- drivers/net/wireguard/messages.h | 2 +- drivers/net/wireguard/queueing.h | 10 +++++++++- drivers/net/wireguard/receive.c | 2 +- drivers/net/wireguard/send.c | 7 ++++++- 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireguard/messages.h b/drivers/net/wireguard/messages.h index b8a7b9ce32ba..208da72673fc 100644 --- a/drivers/net/wireguard/messages.h +++ b/drivers/net/wireguard/messages.h @@ -32,7 +32,7 @@ enum cookie_values { }; enum counter_values { - COUNTER_BITS_TOTAL = 2048, + COUNTER_BITS_TOTAL = 8192, COUNTER_REDUNDANT_BITS = BITS_PER_LONG, COUNTER_WINDOW_SIZE = COUNTER_BITS_TOTAL - COUNTER_REDUNDANT_BITS }; diff --git a/drivers/net/wireguard/queueing.h b/drivers/net/wireguard/queueing.h index 3432232afe06..c58df439dbbe 100644 --- a/drivers/net/wireguard/queueing.h +++ b/drivers/net/wireguard/queueing.h @@ -87,12 +87,20 @@ static inline bool wg_check_packet_protocol(struct sk_buff *skb) return real_protocol && skb->protocol == real_protocol; } -static inline void wg_reset_packet(struct sk_buff *skb) +static inline void wg_reset_packet(struct sk_buff *skb, bool encapsulating) { + u8 l4_hash = skb->l4_hash; + u8 sw_hash = skb->sw_hash; + u32 hash = skb->hash; skb_scrub_packet(skb, true); memset(&skb->headers_start, 0, offsetof(struct sk_buff, headers_end) - offsetof(struct sk_buff, headers_start)); + if (encapsulating) { + skb->l4_hash = l4_hash; + skb->sw_hash = sw_hash; + skb->hash = hash; + } skb->queue_mapping = 0; skb->nohdr = 0; skb->peeked = 0; diff --git a/drivers/net/wireguard/receive.c b/drivers/net/wireguard/receive.c index 3bb5b9ae7cd1..d0eebd90c9d5 100644 --- a/drivers/net/wireguard/receive.c +++ b/drivers/net/wireguard/receive.c @@ -484,7 +484,7 @@ int wg_packet_rx_poll(struct napi_struct *napi, int budget) if (unlikely(wg_socket_endpoint_from_skb(&endpoint, skb))) goto next; - wg_reset_packet(skb); + wg_reset_packet(skb, false); wg_packet_consume_data_done(peer, skb, &endpoint); free = false; diff --git a/drivers/net/wireguard/send.c b/drivers/net/wireguard/send.c index 6687db699803..2f5119ff93d8 100644 --- a/drivers/net/wireguard/send.c +++ b/drivers/net/wireguard/send.c @@ -167,6 +167,11 @@ static bool encrypt_packet(struct sk_buff *skb, struct noise_keypair *keypair) struct sk_buff *trailer; int num_frags; + /* Force hash calculation before encryption so that flow analysis is + * consistent over the inner packet. + */ + skb_get_hash(skb); + /* Calculate lengths. */ padding_len = calculate_skb_padding(skb); trailer_len = padding_len + noise_encrypted_len(0); @@ -295,7 +300,7 @@ void wg_packet_encrypt_worker(struct work_struct *work) skb_list_walk_safe(first, skb, next) { if (likely(encrypt_packet(skb, PACKET_CB(first)->keypair))) { - wg_reset_packet(skb); + wg_reset_packet(skb, true); } else { state = PACKET_STATE_DEAD; break; -- cgit v1.2.3 From a9e90d9931f3a474f04bab782ccd9d77904941e9 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 19 May 2020 22:49:30 -0600 Subject: wireguard: noise: separate receive counter from send counter In "wireguard: queueing: preserve flow hash across packet scrubbing", we were required to slightly increase the size of the receive replay counter to something still fairly small, but an increase nonetheless. It turns out that we can recoup some of the additional memory overhead by splitting up the prior union type into two distinct types. 
Before, we used the same "noise_counter" union for both sending and receiving, with sending just using a simple atomic64_t, while receiving used the full replay counter checker. This meant that most of the memory being allocated for the sending counter was being wasted. Since the old "noise_counter" type increased in size in the prior commit, now is a good time to split up that union type into a distinct "noise_replay_ counter" for receiving and a boring atomic64_t for sending, each using neither more nor less memory than required. Also, since sometimes the replay counter is accessed without necessitating additional accesses to the bitmap, we can reduce cache misses by hoisting the always-necessary lock above the bitmap in the struct layout. We also change a "noise_replay_counter" stack allocation to kmalloc in a -DDEBUG selftest so that KASAN doesn't trigger a stack frame warning. All and all, removing a bit of abstraction in this commit makes the code simpler and smaller, in addition to the motivating memory usage recuperation. For example, passing around raw "noise_symmetric_key" structs is something that really only makes sense within noise.c, in the one place where the sending and receiving keys can safely be thought of as the same type of object; subsequent to that, it's important that we uniformly access these through keypair->{sending,receiving}, where their distinct roles are always made explicit. So this patch allows us to draw that distinction clearly as well. Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- drivers/net/wireguard/noise.c | 16 +++--------- drivers/net/wireguard/noise.h | 14 +++++------ drivers/net/wireguard/receive.c | 42 ++++++++++++++++---------------- drivers/net/wireguard/selftest/counter.c | 17 +++++++++---- drivers/net/wireguard/send.c | 12 ++++----- 5 files changed, 48 insertions(+), 53 deletions(-) diff --git a/drivers/net/wireguard/noise.c b/drivers/net/wireguard/noise.c index 07eb438a6dee..626433690abb 100644 --- a/drivers/net/wireguard/noise.c +++ b/drivers/net/wireguard/noise.c @@ -104,6 +104,7 @@ static struct noise_keypair *keypair_create(struct wg_peer *peer) if (unlikely(!keypair)) return NULL; + spin_lock_init(&keypair->receiving_counter.lock); keypair->internal_id = atomic64_inc_return(&keypair_counter); keypair->entry.type = INDEX_HASHTABLE_KEYPAIR; keypair->entry.peer = peer; @@ -358,25 +359,16 @@ out: memzero_explicit(output, BLAKE2S_HASH_SIZE + 1); } -static void symmetric_key_init(struct noise_symmetric_key *key) -{ - spin_lock_init(&key->counter.receive.lock); - atomic64_set(&key->counter.counter, 0); - memset(key->counter.receive.backtrack, 0, - sizeof(key->counter.receive.backtrack)); - key->birthdate = ktime_get_coarse_boottime_ns(); - key->is_valid = true; -} - static void derive_keys(struct noise_symmetric_key *first_dst, struct noise_symmetric_key *second_dst, const u8 chaining_key[NOISE_HASH_LEN]) { + u64 birthdate = ktime_get_coarse_boottime_ns(); kdf(first_dst->key, second_dst->key, NULL, NULL, NOISE_SYMMETRIC_KEY_LEN, NOISE_SYMMETRIC_KEY_LEN, 0, 0, chaining_key); - symmetric_key_init(first_dst); - symmetric_key_init(second_dst); + first_dst->birthdate = second_dst->birthdate = birthdate; + first_dst->is_valid = second_dst->is_valid = true; } static bool __must_check mix_dh(u8 chaining_key[NOISE_HASH_LEN], diff --git a/drivers/net/wireguard/noise.h b/drivers/net/wireguard/noise.h index f532d59d3f19..c527253dba80 100644 --- 
a/drivers/net/wireguard/noise.h +++ b/drivers/net/wireguard/noise.h @@ -15,18 +15,14 @@ #include #include -union noise_counter { - struct { - u64 counter; - unsigned long backtrack[COUNTER_BITS_TOTAL / BITS_PER_LONG]; - spinlock_t lock; - } receive; - atomic64_t counter; +struct noise_replay_counter { + u64 counter; + spinlock_t lock; + unsigned long backtrack[COUNTER_BITS_TOTAL / BITS_PER_LONG]; }; struct noise_symmetric_key { u8 key[NOISE_SYMMETRIC_KEY_LEN]; - union noise_counter counter; u64 birthdate; bool is_valid; }; @@ -34,7 +30,9 @@ struct noise_symmetric_key { struct noise_keypair { struct index_hashtable_entry entry; struct noise_symmetric_key sending; + atomic64_t sending_counter; struct noise_symmetric_key receiving; + struct noise_replay_counter receiving_counter; __le32 remote_index; bool i_am_the_initiator; struct kref refcount; diff --git a/drivers/net/wireguard/receive.c b/drivers/net/wireguard/receive.c index d0eebd90c9d5..91438144e4f7 100644 --- a/drivers/net/wireguard/receive.c +++ b/drivers/net/wireguard/receive.c @@ -245,20 +245,20 @@ static void keep_key_fresh(struct wg_peer *peer) } } -static bool decrypt_packet(struct sk_buff *skb, struct noise_symmetric_key *key) +static bool decrypt_packet(struct sk_buff *skb, struct noise_keypair *keypair) { struct scatterlist sg[MAX_SKB_FRAGS + 8]; struct sk_buff *trailer; unsigned int offset; int num_frags; - if (unlikely(!key)) + if (unlikely(!keypair)) return false; - if (unlikely(!READ_ONCE(key->is_valid) || - wg_birthdate_has_expired(key->birthdate, REJECT_AFTER_TIME) || - key->counter.receive.counter >= REJECT_AFTER_MESSAGES)) { - WRITE_ONCE(key->is_valid, false); + if (unlikely(!READ_ONCE(keypair->receiving.is_valid) || + wg_birthdate_has_expired(keypair->receiving.birthdate, REJECT_AFTER_TIME) || + keypair->receiving_counter.counter >= REJECT_AFTER_MESSAGES)) { + WRITE_ONCE(keypair->receiving.is_valid, false); return false; } @@ -283,7 +283,7 @@ static bool decrypt_packet(struct sk_buff *skb, struct noise_symmetric_key *key) if (!chacha20poly1305_decrypt_sg_inplace(sg, skb->len, NULL, 0, PACKET_CB(skb)->nonce, - key->key)) + keypair->receiving.key)) return false; /* Another ugly situation of pushing and pulling the header so as to @@ -298,41 +298,41 @@ static bool decrypt_packet(struct sk_buff *skb, struct noise_symmetric_key *key) } /* This is RFC6479, a replay detection bitmap algorithm that avoids bitshifts */ -static bool counter_validate(union noise_counter *counter, u64 their_counter) +static bool counter_validate(struct noise_replay_counter *counter, u64 their_counter) { unsigned long index, index_current, top, i; bool ret = false; - spin_lock_bh(&counter->receive.lock); + spin_lock_bh(&counter->lock); - if (unlikely(counter->receive.counter >= REJECT_AFTER_MESSAGES + 1 || + if (unlikely(counter->counter >= REJECT_AFTER_MESSAGES + 1 || their_counter >= REJECT_AFTER_MESSAGES)) goto out; ++their_counter; if (unlikely((COUNTER_WINDOW_SIZE + their_counter) < - counter->receive.counter)) + counter->counter)) goto out; index = their_counter >> ilog2(BITS_PER_LONG); - if (likely(their_counter > counter->receive.counter)) { - index_current = counter->receive.counter >> ilog2(BITS_PER_LONG); + if (likely(their_counter > counter->counter)) { + index_current = counter->counter >> ilog2(BITS_PER_LONG); top = min_t(unsigned long, index - index_current, COUNTER_BITS_TOTAL / BITS_PER_LONG); for (i = 1; i <= top; ++i) - counter->receive.backtrack[(i + index_current) & + counter->backtrack[(i + index_current) & 
((COUNTER_BITS_TOTAL / BITS_PER_LONG) - 1)] = 0; - counter->receive.counter = their_counter; + counter->counter = their_counter; } index &= (COUNTER_BITS_TOTAL / BITS_PER_LONG) - 1; ret = !test_and_set_bit(their_counter & (BITS_PER_LONG - 1), - &counter->receive.backtrack[index]); + &counter->backtrack[index]); out: - spin_unlock_bh(&counter->receive.lock); + spin_unlock_bh(&counter->lock); return ret; } @@ -472,12 +472,12 @@ int wg_packet_rx_poll(struct napi_struct *napi, int budget) if (unlikely(state != PACKET_STATE_CRYPTED)) goto next; - if (unlikely(!counter_validate(&keypair->receiving.counter, + if (unlikely(!counter_validate(&keypair->receiving_counter, PACKET_CB(skb)->nonce))) { net_dbg_ratelimited("%s: Packet has invalid nonce %llu (max %llu)\n", peer->device->dev->name, PACKET_CB(skb)->nonce, - keypair->receiving.counter.receive.counter); + keypair->receiving_counter.counter); goto next; } @@ -511,8 +511,8 @@ void wg_packet_decrypt_worker(struct work_struct *work) struct sk_buff *skb; while ((skb = ptr_ring_consume_bh(&queue->ring)) != NULL) { - enum packet_state state = likely(decrypt_packet(skb, - &PACKET_CB(skb)->keypair->receiving)) ? + enum packet_state state = + likely(decrypt_packet(skb, PACKET_CB(skb)->keypair)) ? PACKET_STATE_CRYPTED : PACKET_STATE_DEAD; wg_queue_enqueue_per_peer_napi(skb, state); if (need_resched()) diff --git a/drivers/net/wireguard/selftest/counter.c b/drivers/net/wireguard/selftest/counter.c index f4fbb9072ed7..ec3c156bf91b 100644 --- a/drivers/net/wireguard/selftest/counter.c +++ b/drivers/net/wireguard/selftest/counter.c @@ -6,18 +6,24 @@ #ifdef DEBUG bool __init wg_packet_counter_selftest(void) { + struct noise_replay_counter *counter; unsigned int test_num = 0, i; - union noise_counter counter; bool success = true; -#define T_INIT do { \ - memset(&counter, 0, sizeof(union noise_counter)); \ - spin_lock_init(&counter.receive.lock); \ + counter = kmalloc(sizeof(*counter), GFP_KERNEL); + if (unlikely(!counter)) { + pr_err("nonce counter self-test malloc: FAIL\n"); + return false; + } + +#define T_INIT do { \ + memset(counter, 0, sizeof(*counter)); \ + spin_lock_init(&counter->lock); \ } while (0) #define T_LIM (COUNTER_WINDOW_SIZE + 1) #define T(n, v) do { \ ++test_num; \ - if (counter_validate(&counter, n) != (v)) { \ + if (counter_validate(counter, n) != (v)) { \ pr_err("nonce counter self-test %u: FAIL\n", \ test_num); \ success = false; \ @@ -99,6 +105,7 @@ bool __init wg_packet_counter_selftest(void) if (success) pr_info("nonce counter self-tests: pass\n"); + kfree(counter); return success; } #endif diff --git a/drivers/net/wireguard/send.c b/drivers/net/wireguard/send.c index 2f5119ff93d8..f74b9341ab0f 100644 --- a/drivers/net/wireguard/send.c +++ b/drivers/net/wireguard/send.c @@ -129,7 +129,7 @@ static void keep_key_fresh(struct wg_peer *peer) rcu_read_lock_bh(); keypair = rcu_dereference_bh(peer->keypairs.current_keypair); send = keypair && READ_ONCE(keypair->sending.is_valid) && - (atomic64_read(&keypair->sending.counter.counter) > REKEY_AFTER_MESSAGES || + (atomic64_read(&keypair->sending_counter) > REKEY_AFTER_MESSAGES || (keypair->i_am_the_initiator && wg_birthdate_has_expired(keypair->sending.birthdate, REKEY_AFTER_TIME))); rcu_read_unlock_bh(); @@ -349,7 +349,6 @@ void wg_packet_purge_staged_packets(struct wg_peer *peer) void wg_packet_send_staged_packets(struct wg_peer *peer) { - struct noise_symmetric_key *key; struct noise_keypair *keypair; struct sk_buff_head packets; struct sk_buff *skb; @@ -369,10 +368,9 @@ void 
wg_packet_send_staged_packets(struct wg_peer *peer) rcu_read_unlock_bh(); if (unlikely(!keypair)) goto out_nokey; - key = &keypair->sending; - if (unlikely(!READ_ONCE(key->is_valid))) + if (unlikely(!READ_ONCE(keypair->sending.is_valid))) goto out_nokey; - if (unlikely(wg_birthdate_has_expired(key->birthdate, + if (unlikely(wg_birthdate_has_expired(keypair->sending.birthdate, REJECT_AFTER_TIME))) goto out_invalid; @@ -387,7 +385,7 @@ void wg_packet_send_staged_packets(struct wg_peer *peer) */ PACKET_CB(skb)->ds = ip_tunnel_ecn_encap(0, ip_hdr(skb), skb); PACKET_CB(skb)->nonce = - atomic64_inc_return(&key->counter.counter) - 1; + atomic64_inc_return(&keypair->sending_counter) - 1; if (unlikely(PACKET_CB(skb)->nonce >= REJECT_AFTER_MESSAGES)) goto out_invalid; } @@ -399,7 +397,7 @@ void wg_packet_send_staged_packets(struct wg_peer *peer) return; out_invalid: - WRITE_ONCE(key->is_valid, false); + WRITE_ONCE(keypair->sending.is_valid, false); out_nokey: wg_noise_keypair_put(keypair, false); -- cgit v1.2.3 From 687775cec056b38a4c8f3291e0dd7a9145f7b667 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 May 2020 18:24:43 -0700 Subject: ax25: fix setsockopt(SO_BINDTODEVICE) syzbot was able to trigger this trace [1], probably by using a zero optlen. While we are at it, cap optlen to IFNAMSIZ - 1 instead of IFNAMSIZ. [1] BUG: KMSAN: uninit-value in strnlen+0xf9/0x170 lib/string.c:569 CPU: 0 PID: 8807 Comm: syz-executor483 Not tainted 5.7.0-rc4-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1c9/0x220 lib/dump_stack.c:118 kmsan_report+0xf7/0x1e0 mm/kmsan/kmsan_report.c:121 __msan_warning+0x58/0xa0 mm/kmsan/kmsan_instr.c:215 strnlen+0xf9/0x170 lib/string.c:569 dev_name_hash net/core/dev.c:207 [inline] netdev_name_node_lookup net/core/dev.c:277 [inline] __dev_get_by_name+0x75/0x2b0 net/core/dev.c:778 ax25_setsockopt+0xfa3/0x1170 net/ax25/af_ax25.c:654 __compat_sys_setsockopt+0x4ed/0x910 net/compat.c:403 __do_compat_sys_setsockopt net/compat.c:413 [inline] __se_compat_sys_setsockopt+0xdd/0x100 net/compat.c:410 __ia32_compat_sys_setsockopt+0x62/0x80 net/compat.c:410 do_syscall_32_irqs_on arch/x86/entry/common.c:339 [inline] do_fast_syscall_32+0x3bf/0x6d0 arch/x86/entry/common.c:398 entry_SYSENTER_compat+0x68/0x77 arch/x86/entry/entry_64_compat.S:139 RIP: 0023:0xf7f57dd9 Code: 90 e8 0b 00 00 00 f3 90 0f ae e8 eb f9 8d 74 26 00 89 3c 24 c3 90 90 90 90 90 90 90 90 90 90 90 90 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 eb 0d 90 90 90 90 90 90 90 90 90 90 90 90 RSP: 002b:00000000ffae8c1c EFLAGS: 00000217 ORIG_RAX: 000000000000016e RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000000000101 RDX: 0000000000000019 RSI: 0000000020000000 RDI: 0000000000000004 RBP: 0000000000000012 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Local variable ----devname@ax25_setsockopt created at: ax25_setsockopt+0xe6/0x1170 net/ax25/af_ax25.c:536 ax25_setsockopt+0xe6/0x1170 net/ax25/af_ax25.c:536 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. 
Miller --- net/ax25/af_ax25.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index ff57ea89c27e..fd91cd34f25e 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -635,8 +635,10 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname, break; case SO_BINDTODEVICE: - if (optlen > IFNAMSIZ) - optlen = IFNAMSIZ; + if (optlen > IFNAMSIZ - 1) + optlen = IFNAMSIZ - 1; + + memset(devname, 0, sizeof(devname)); if (copy_from_user(devname, optval, optlen)) { res = -EFAULT; -- cgit v1.2.3 From d69100b8eee27c2d60ee52df76e0b80a8d492d34 Mon Sep 17 00:00:00 2001 From: Stephen Worley Date: Tue, 19 May 2020 21:57:12 -0400 Subject: net: nlmsg_cancel() if put fails for nhmsg Fixes data remnant seen when we fail to reserve space for a nexthop group during a larger dump. If we fail the reservation, we goto nla_put_failure and cancel the message. Reproduce with the following iproute2 commands: ===================== ip link add dummy1 type dummy ip link add dummy2 type dummy ip link add dummy3 type dummy ip link add dummy4 type dummy ip link add dummy5 type dummy ip link add dummy6 type dummy ip link add dummy7 type dummy ip link add dummy8 type dummy ip link add dummy9 type dummy ip link add dummy10 type dummy ip link add dummy11 type dummy ip link add dummy12 type dummy ip link add dummy13 type dummy ip link add dummy14 type dummy ip link add dummy15 type dummy ip link add dummy16 type dummy ip link add dummy17 type dummy ip link add dummy18 type dummy ip link add dummy19 type dummy ip link add dummy20 type dummy ip link add dummy21 type dummy ip link add dummy22 type dummy ip link add dummy23 type dummy ip link add dummy24 type dummy ip link add dummy25 type dummy ip link add dummy26 type dummy ip link add dummy27 type dummy ip link add dummy28 type dummy ip link add dummy29 type dummy ip link add dummy30 type dummy ip link add dummy31 type dummy ip link add dummy32 type dummy ip link set dummy1 up ip link set dummy2 up ip link set dummy3 up ip link set dummy4 up ip link set dummy5 up ip link set dummy6 up ip link set dummy7 up ip link set dummy8 up ip link set dummy9 up ip link set dummy10 up ip link set dummy11 up ip link set dummy12 up ip link set dummy13 up ip link set dummy14 up ip link set dummy15 up ip link set dummy16 up ip link set dummy17 up ip link set dummy18 up ip link set dummy19 up ip link set dummy20 up ip link set dummy21 up ip link set dummy22 up ip link set dummy23 up ip link set dummy24 up ip link set dummy25 up ip link set dummy26 up ip link set dummy27 up ip link set dummy28 up ip link set dummy29 up ip link set dummy30 up ip link set dummy31 up ip link set dummy32 up ip link set dummy33 up ip link set dummy34 up ip link set vrf-red up ip link set vrf-blue up ip link set dummyVRFred up ip link set dummyVRFblue up ip ro add 1.1.1.1/32 dev dummy1 ip ro add 1.1.1.2/32 dev dummy2 ip ro add 1.1.1.3/32 dev dummy3 ip ro add 1.1.1.4/32 dev dummy4 ip ro add 1.1.1.5/32 dev dummy5 ip ro add 1.1.1.6/32 dev dummy6 ip ro add 1.1.1.7/32 dev dummy7 ip ro add 1.1.1.8/32 dev dummy8 ip ro add 1.1.1.9/32 dev dummy9 ip ro add 1.1.1.10/32 dev dummy10 ip ro add 1.1.1.11/32 dev dummy11 ip ro add 1.1.1.12/32 dev dummy12 ip ro add 1.1.1.13/32 dev dummy13 ip ro add 1.1.1.14/32 dev dummy14 ip ro add 1.1.1.15/32 dev dummy15 ip ro add 1.1.1.16/32 dev dummy16 ip ro add 1.1.1.17/32 dev dummy17 ip ro add 1.1.1.18/32 dev dummy18 ip ro add 1.1.1.19/32 dev dummy19 ip ro add 1.1.1.20/32 dev dummy20 ip ro add 1.1.1.21/32 dev dummy21 ip 
ro add 1.1.1.22/32 dev dummy22 ip ro add 1.1.1.23/32 dev dummy23 ip ro add 1.1.1.24/32 dev dummy24 ip ro add 1.1.1.25/32 dev dummy25 ip ro add 1.1.1.26/32 dev dummy26 ip ro add 1.1.1.27/32 dev dummy27 ip ro add 1.1.1.28/32 dev dummy28 ip ro add 1.1.1.29/32 dev dummy29 ip ro add 1.1.1.30/32 dev dummy30 ip ro add 1.1.1.31/32 dev dummy31 ip ro add 1.1.1.32/32 dev dummy32 ip next add id 1 via 1.1.1.1 dev dummy1 ip next add id 2 via 1.1.1.2 dev dummy2 ip next add id 3 via 1.1.1.3 dev dummy3 ip next add id 4 via 1.1.1.4 dev dummy4 ip next add id 5 via 1.1.1.5 dev dummy5 ip next add id 6 via 1.1.1.6 dev dummy6 ip next add id 7 via 1.1.1.7 dev dummy7 ip next add id 8 via 1.1.1.8 dev dummy8 ip next add id 9 via 1.1.1.9 dev dummy9 ip next add id 10 via 1.1.1.10 dev dummy10 ip next add id 11 via 1.1.1.11 dev dummy11 ip next add id 12 via 1.1.1.12 dev dummy12 ip next add id 13 via 1.1.1.13 dev dummy13 ip next add id 14 via 1.1.1.14 dev dummy14 ip next add id 15 via 1.1.1.15 dev dummy15 ip next add id 16 via 1.1.1.16 dev dummy16 ip next add id 17 via 1.1.1.17 dev dummy17 ip next add id 18 via 1.1.1.18 dev dummy18 ip next add id 19 via 1.1.1.19 dev dummy19 ip next add id 20 via 1.1.1.20 dev dummy20 ip next add id 21 via 1.1.1.21 dev dummy21 ip next add id 22 via 1.1.1.22 dev dummy22 ip next add id 23 via 1.1.1.23 dev dummy23 ip next add id 24 via 1.1.1.24 dev dummy24 ip next add id 25 via 1.1.1.25 dev dummy25 ip next add id 26 via 1.1.1.26 dev dummy26 ip next add id 27 via 1.1.1.27 dev dummy27 ip next add id 28 via 1.1.1.28 dev dummy28 ip next add id 29 via 1.1.1.29 dev dummy29 ip next add id 30 via 1.1.1.30 dev dummy30 ip next add id 31 via 1.1.1.31 dev dummy31 ip next add id 32 via 1.1.1.32 dev dummy32 i=100 while [ $i -le 200 ] do ip next add id $i group 1/2/3/4/5/6/7/8/9/10/11/12/13/14/15/16/17/18/19 echo $i ((i++)) done ip next add id 999 group 1/2/3/4/5/6 ip next ls ======================== Fixes: ab84be7e54fc ("net: Initial nexthop code") Signed-off-by: Stephen Worley Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/nexthop.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 2a31c4af845e..715e14475220 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -276,6 +276,7 @@ out: return 0; nla_put_failure: + nlmsg_cancel(skb, nlh); return -EMSGSIZE; } -- cgit v1.2.3 From 7c87e32d2e380228ada79d20ac5b7674718ef097 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Sun, 10 May 2020 21:04:09 +0200 Subject: ethtool: count header size in reply size estimate As ethnl_request_ops::reply_size handlers do not include common header size into calculated/estimated reply size, it needs to be added in ethnl_default_doit() and ethnl_default_notify() before allocating the message. On the other hand, strset_reply_size() should not add common header size. Fixes: 728480f12442 ("ethtool: default handlers for GET requests") Reported-by: Oleksij Rempel Signed-off-by: Michal Kubecek Signed-off-by: David S. 
Miller --- net/ethtool/netlink.c | 4 ++-- net/ethtool/strset.c | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 0c772318c023..ed5357210193 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -342,7 +342,7 @@ static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info) ret = ops->reply_size(req_info, reply_data); if (ret < 0) goto err_cleanup; - reply_len = ret; + reply_len = ret + ethnl_reply_header_size(); ret = -ENOMEM; rskb = ethnl_reply_init(reply_len, req_info->dev, ops->reply_cmd, ops->hdr_attr, info, &reply_payload); @@ -588,7 +588,7 @@ static void ethnl_default_notify(struct net_device *dev, unsigned int cmd, ret = ops->reply_size(req_info, reply_data); if (ret < 0) goto err_cleanup; - reply_len = ret; + reply_len = ret + ethnl_reply_header_size(); ret = -ENOMEM; skb = genlmsg_new(reply_len, GFP_KERNEL); if (!skb) diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index 95eae5c68a52..0eed4e4909ab 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -324,7 +324,6 @@ static int strset_reply_size(const struct ethnl_req_info *req_base, int len = 0; int ret; - len += ethnl_reply_header_size(); for (i = 0; i < ETH_SS_COUNT; i++) { const struct strset_info *set_info = &data->sets[i]; -- cgit v1.2.3 From d28ea1fbbf437054ef339afec241019f2c4e2bb6 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Tue, 19 May 2020 23:44:16 +0530 Subject: net: qrtr: Fix passing invalid reference to qrtr_local_enqueue() Once the traversal of the list is completed with list_for_each_entry(), the iterator (node) will point to an invalid object. So passing this to qrtr_local_enqueue() which is outside of the iterator block is erroneous eventhough the object is not used. So fix this by passing NULL to qrtr_local_enqueue(). Fixes: bdabad3e363d ("net: Add Qualcomm IPC router") Reported-by: kbuild test robot Reported-by: Julia Lawall Signed-off-by: Manivannan Sadhasivam Reviewed-by: Bjorn Andersson Signed-off-by: David S. Miller --- net/qrtr/qrtr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 7ed31b5e77e4..2d8d6131bc5f 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -854,7 +854,7 @@ static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb, } mutex_unlock(&qrtr_node_lock); - qrtr_local_enqueue(node, skb, type, from, to); + qrtr_local_enqueue(NULL, skb, type, from, to); return 0; } -- cgit v1.2.3 From 3469660d1b15ccfdf7b33295c306b6298ca730aa Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 20 May 2020 11:41:15 +0800 Subject: net: ethernet: ti: fix some return value check of cpsw_ale_create() cpsw_ale_create() can return both NULL and PTR_ERR(), but all of the caller only check NULL for error handling. This patch convert it to only return PTR_ERR() in all error cases, and the caller using IS_ERR() instead of NULL test. Fixes: 4b41d3436796 ("net: ethernet: ti: cpsw: allow untagged traffic on host port") Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Signed-off-by: David S. 
Miller --- drivers/net/ethernet/ti/cpsw_ale.c | 2 +- drivers/net/ethernet/ti/cpsw_priv.c | 4 ++-- drivers/net/ethernet/ti/netcp_ethss.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/ti/cpsw_ale.c b/drivers/net/ethernet/ti/cpsw_ale.c index 0374e6936091..8dc6be11b2ff 100644 --- a/drivers/net/ethernet/ti/cpsw_ale.c +++ b/drivers/net/ethernet/ti/cpsw_ale.c @@ -955,7 +955,7 @@ struct cpsw_ale *cpsw_ale_create(struct cpsw_ale_params *params) ale = devm_kzalloc(params->dev, sizeof(*ale), GFP_KERNEL); if (!ale) - return NULL; + return ERR_PTR(-ENOMEM); ale->p0_untag_vid_mask = devm_kmalloc_array(params->dev, BITS_TO_LONGS(VLAN_N_VID), diff --git a/drivers/net/ethernet/ti/cpsw_priv.c b/drivers/net/ethernet/ti/cpsw_priv.c index 97a058ca60ac..d0b6c418a870 100644 --- a/drivers/net/ethernet/ti/cpsw_priv.c +++ b/drivers/net/ethernet/ti/cpsw_priv.c @@ -490,9 +490,9 @@ int cpsw_init_common(struct cpsw_common *cpsw, void __iomem *ss_regs, ale_params.ale_ports = CPSW_ALE_PORTS_NUM; cpsw->ale = cpsw_ale_create(&ale_params); - if (!cpsw->ale) { + if (IS_ERR(cpsw->ale)) { dev_err(dev, "error initializing ale engine\n"); - return -ENODEV; + return PTR_ERR(cpsw->ale); } dma_params.dev = dev; diff --git a/drivers/net/ethernet/ti/netcp_ethss.c b/drivers/net/ethernet/ti/netcp_ethss.c index fb36115e9c51..fdbae734acce 100644 --- a/drivers/net/ethernet/ti/netcp_ethss.c +++ b/drivers/net/ethernet/ti/netcp_ethss.c @@ -3704,9 +3704,9 @@ static int gbe_probe(struct netcp_device *netcp_device, struct device *dev, ale_params.nu_switch_ale = true; } gbe_dev->ale = cpsw_ale_create(&ale_params); - if (!gbe_dev->ale) { + if (IS_ERR(gbe_dev->ale)) { dev_err(gbe_dev->dev, "error initializing ale engine\n"); - ret = -ENODEV; + ret = PTR_ERR(gbe_dev->ale); goto free_sec_ports; } else { dev_dbg(gbe_dev->dev, "Created a gbe ale engine\n"); -- cgit v1.2.3 From 1401cf600d548c72f51e20b5841c330d5c11c9e2 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 20 May 2020 11:41:16 +0800 Subject: net: ethernet: ti: am65-cpsw-nuss: fix error handling of am65_cpsw_nuss_probe Convert to using IS_ERR() instead of NULL test for cpsw_ale_create() error handling. Also fix to return negative error code from this error handling case instead of 0 in. Fixes: 93a76530316a ("net: ethernet: ti: introduce am65x/j721e gigabit eth subsystem driver") Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 2517ffba8178..88f52a2f85b3 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -1895,8 +1895,9 @@ static int am65_cpsw_nuss_probe(struct platform_device *pdev) ale_params.nu_switch_ale = true; common->ale = cpsw_ale_create(&ale_params); - if (!common->ale) { + if (IS_ERR(common->ale)) { dev_err(dev, "error initializing ale engine\n"); + ret = PTR_ERR(common->ale); goto err_of_clear; } -- cgit v1.2.3 From a7bff11f6f9afa87c25711db8050c9b5324db0e2 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 20 May 2020 11:41:43 +0300 Subject: net/tls: fix encryption error checking bpf_exec_tx_verdict() can return negative value for copied variable. In that case this value will be pushed back to caller and the real error code will be lost. Fix it using signed type and checking for positive value. 
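As a toy userspace illustration (not kernel code) of why the type matters: once a negative error is folded into an unsigned copied counter, the old "copied ? copied : ret" pattern hands the caller a huge bogus byte count instead of the error, whereas the signed type with a "> 0" test preserves it:

#include <stdio.h>
#include <sys/types.h>

int main(void)
{
	int ret = -32;			/* e.g. -EPIPE coming back from the push path */
	size_t copied_unsigned = ret;	/* old type: the negative value wraps */
	ssize_t copied_signed = ret;	/* new type keeps the sign */

	/* Old pattern: reports an enormous "success" to the caller. */
	printf("unsigned copied: %zu\n",
	       copied_unsigned ? copied_unsigned : (size_t)ret);

	/* Fixed pattern: only report copied when something was really copied. */
	printf("signed copied: %zd\n",
	       copied_signed > 0 ? copied_signed : ret);
	return 0;
}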
Fixes: d10523d0b3d7 ("net/tls: free the record on encryption error") Fixes: d3b18ad31f93 ("tls: add bpf support to sk_msg handling") Signed-off-by: Vadim Fedorenko Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index e23f94a5549b..57f80823330e 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -780,7 +780,7 @@ static int tls_push_record(struct sock *sk, int flags, static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, bool full_record, u8 record_type, - size_t *copied, int flags) + ssize_t *copied, int flags) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); @@ -916,7 +916,8 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) unsigned char record_type = TLS_RECORD_TYPE_DATA; bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); bool eor = !(msg->msg_flags & MSG_MORE); - size_t try_to_copy, copied = 0; + size_t try_to_copy; + ssize_t copied = 0; struct sk_msg *msg_pl, *msg_en; struct tls_rec *rec; int required_size; @@ -1118,7 +1119,7 @@ send_end: release_sock(sk); mutex_unlock(&tls_ctx->tx_lock); - return copied ? copied : ret; + return copied > 0 ? copied : ret; } static int tls_sw_do_sendpage(struct sock *sk, struct page *page, @@ -1132,7 +1133,7 @@ static int tls_sw_do_sendpage(struct sock *sk, struct page *page, struct sk_msg *msg_pl; struct tls_rec *rec; int num_async = 0; - size_t copied = 0; + ssize_t copied = 0; bool full_record; int record_room; int ret = 0; @@ -1234,7 +1235,7 @@ wait_for_memory: } sendpage_end: ret = sk_stream_error(sk, flags, ret); - return copied ? copied : ret; + return copied > 0 ? copied : ret; } int tls_sw_sendpage_locked(struct sock *sk, struct page *page, -- cgit v1.2.3 From 635d9398178659d8ddba79dd061f9451cec0b4d1 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 20 May 2020 11:41:44 +0300 Subject: net/tls: free record only on encryption error We cannot free record on any transient error because it leads to losing previos data. Check socket error to know whether record must be freed or not. Fixes: d10523d0b3d7 ("net/tls: free the record on encryption error") Signed-off-by: Vadim Fedorenko Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 57f80823330e..2d399b6c4075 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -796,9 +796,10 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, psock = sk_psock_get(sk); if (!psock || !policy) { err = tls_push_record(sk, flags, record_type); - if (err && err != -EINPROGRESS) { + if (err && sk->sk_err == EBADMSG) { *copied -= sk_msg_free(sk, msg); tls_free_open_rec(sk); + err = -sk->sk_err; } if (psock) sk_psock_put(sk, psock); @@ -824,9 +825,10 @@ more_data: switch (psock->eval) { case __SK_PASS: err = tls_push_record(sk, flags, record_type); - if (err && err != -EINPROGRESS) { + if (err && sk->sk_err == EBADMSG) { *copied -= sk_msg_free(sk, msg); tls_free_open_rec(sk); + err = -sk->sk_err; goto out_err; } break; -- cgit v1.2.3 From 57ebc8f08504f176eb0f25b3e0fde517dec61a4f Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 20 May 2020 11:50:48 +0300 Subject: net: ipip: fix wrong address family in init error path In case of error with MPLS support the code is misusing AF_INET instead of AF_MPLS. 
Fixes: 1b69e7e6c4da ("ipip: support MPLS over IPv4") Signed-off-by: Vadim Fedorenko Signed-off-by: David S. Miller --- net/ipv4/ipip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 2f01cf6fa0de..678575adaf3b 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -698,7 +698,7 @@ out: rtnl_link_failed: #if IS_ENABLED(CONFIG_MPLS) - xfrm4_tunnel_deregister(&mplsip_handler, AF_INET); + xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS); xfrm_tunnel_mplsip_failed: #endif -- cgit v1.2.3 From 41b4bd986f86331efc599b9a3f5fb86ad92e9af9 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 20 May 2020 11:15:46 +0200 Subject: net: don't return invalid table id error when we fall back to PF_UNSPEC In case we can't find a ->dumpit callback for the requested (family,type) pair, we fall back to (PF_UNSPEC,type). In effect, we're in the same situation as if userspace had requested a PF_UNSPEC dump. For RTM_GETROUTE, that handler is rtnl_dump_all, which calls all the registered RTM_GETROUTE handlers. The requested table id may or may not exist for all of those families. commit ae677bbb4441 ("net: Don't return invalid table id error when dumping all families") fixed the problem when userspace explicitly requests a PF_UNSPEC dump, but missed the fallback case. For example, when we pass ipv6.disable=1 to a kernel with CONFIG_IP_MROUTE=y and CONFIG_IP_MROUTE_MULTIPLE_TABLES=y, the (PF_INET6, RTM_GETROUTE) handler isn't registered, so we end up in rtnl_dump_all, and listing IPv6 routes will unexpectedly print: # ip -6 r Error: ipv4: MR table does not exist. Dump terminated commit ae677bbb4441 introduced the dump_all_families variable, which gets set when userspace requests a PF_UNSPEC dump. However, we can't simply set the family to PF_UNSPEC in rtnetlink_rcv_msg in the fallback case to get dump_all_families == true, because some messages types (for example RTM_GETRULE and RTM_GETNEIGH) only register the PF_UNSPEC handler and use the family to filter in the kernel what is dumped to userspace. We would then export more entries, that userspace would have to filter. iproute does that, but other programs may not. Instead, this patch removes dump_all_families and updates the RTM_GETROUTE handlers to check if the family that is being dumped is their own. When it's not, which covers both the intentional PF_UNSPEC dumps (as dump_all_families did) and the fallback case, ignore the missing table id error. Fixes: cb167893f41e ("net: Plumb support for filtering ipv4 and ipv6 multicast route dumps") Signed-off-by: Sabrina Dubroca Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip_fib.h | 1 - net/ipv4/fib_frontend.c | 3 +-- net/ipv4/ipmr.c | 2 +- net/ipv6/ip6_fib.c | 2 +- net/ipv6/ip6mr.c | 2 +- 5 files changed, 4 insertions(+), 6 deletions(-) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 59e0d4e99f94..b219a8fe0950 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -257,7 +257,6 @@ struct fib_dump_filter { u32 table_id; /* filter_set is an optimization that an entry is set */ bool filter_set; - bool dump_all_families; bool dump_routes; bool dump_exceptions; unsigned char protocol; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 213be9c050ad..1bf9da3a75f9 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -918,7 +918,6 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, else filter->dump_exceptions = false; - filter->dump_all_families = (rtm->rtm_family == AF_UNSPEC); filter->flags = rtm->rtm_flags; filter->protocol = rtm->rtm_protocol; filter->rt_type = rtm->rtm_type; @@ -990,7 +989,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) if (filter.table_id) { tb = fib_get_table(net, filter.table_id); if (!tb) { - if (filter.dump_all_families) + if (rtnl_msg_family(cb->nlh) != PF_INET) return skb->len; NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist"); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 5c218db2dede..b2363b82b48d 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2613,7 +2613,7 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) mrt = ipmr_get_table(sock_net(skb->sk), filter.table_id); if (!mrt) { - if (filter.dump_all_families) + if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IPMR) return skb->len; NL_SET_ERR_MSG(cb->extack, "ipv4: MR table does not exist"); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 46ed56719476..20314895509c 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -664,7 +664,7 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) if (arg.filter.table_id) { tb = fib6_get_table(net, arg.filter.table_id); if (!tb) { - if (arg.filter.dump_all_families) + if (rtnl_msg_family(cb->nlh) != PF_INET6) goto out; NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist"); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 1e223e26f079..1f4d20e97c07 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -2503,7 +2503,7 @@ static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) mrt = ip6mr_get_table(sock_net(skb->sk), filter.table_id); if (!mrt) { - if (filter.dump_all_families) + if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IP6MR) return skb->len; NL_SET_ERR_MSG_MOD(cb->extack, "MR table does not exist"); -- cgit v1.2.3 From a7654211d0ffeaa8eb0545ea00f8445242cbce05 Mon Sep 17 00:00:00 2001 From: Tang Bin Date: Wed, 20 May 2020 17:55:32 +0800 Subject: net: sgi: ioc3-eth: Fix return value check in ioc3eth_probe() In the function devm_platform_ioremap_resource(), if get resource failed, the return value is ERR_PTR() not NULL. Thus it must be replaced by IS_ERR(), or else it may result in crashes if a critical error path is encountered. Fixes: 0ce5ebd24d25 ("mfd: ioc3: Add driver for SGI IOC3 chip") Signed-off-by: Zhang Shengju Signed-off-by: Tang Bin Signed-off-by: David S. 
Miller --- drivers/net/ethernet/sgi/ioc3-eth.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/sgi/ioc3-eth.c b/drivers/net/ethernet/sgi/ioc3-eth.c index 7305e8e86c51..6646eba9f57f 100644 --- a/drivers/net/ethernet/sgi/ioc3-eth.c +++ b/drivers/net/ethernet/sgi/ioc3-eth.c @@ -848,14 +848,14 @@ static int ioc3eth_probe(struct platform_device *pdev) ip = netdev_priv(dev); ip->dma_dev = pdev->dev.parent; ip->regs = devm_platform_ioremap_resource(pdev, 0); - if (!ip->regs) { - err = -ENOMEM; + if (IS_ERR(ip->regs)) { + err = PTR_ERR(ip->regs); goto out_free; } ip->ssram = devm_platform_ioremap_resource(pdev, 1); - if (!ip->ssram) { - err = -ENOMEM; + if (IS_ERR(ip->ssram)) { + err = PTR_ERR(ip->ssram); goto out_free; } -- cgit v1.2.3 From 5cf65922bb15279402e1e19b5ee8c51d618fa51f Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 21 May 2020 10:34:35 +0200 Subject: flow_dissector: Drop BPF flow dissector prog ref on netns cleanup When attaching a flow dissector program to a network namespace with bpf(BPF_PROG_ATTACH, ...) we grab a reference to bpf_prog. If netns gets destroyed while a flow dissector is still attached, and there are no other references to the prog, we leak the reference and the program remains loaded. Leak can be reproduced by running flow dissector tests from selftests/bpf: # bpftool prog list # ./test_flow_dissector.sh ... selftests: test_flow_dissector [PASS] # bpftool prog list 4: flow_dissector name _dissect tag e314084d332a5338 gpl loaded_at 2020-05-20T18:50:53+0200 uid 0 xlated 552B jited 355B memlock 4096B map_ids 3,4 btf_id 4 # Fix it by detaching the flow dissector program when netns is going away. Fixes: d58e468b1112 ("flow_dissector: implements flow dissector BPF hook") Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20200521083435.560256-1-jakub@cloudflare.com --- net/core/flow_dissector.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 3eff84824c8b..5dceed467f64 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -160,12 +160,10 @@ out: return ret; } -int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) +static int flow_dissector_bpf_prog_detach(struct net *net) { struct bpf_prog *attached; - struct net *net; - net = current->nsproxy->net_ns; mutex_lock(&flow_dissector_mutex); attached = rcu_dereference_protected(net->flow_dissector_prog, lockdep_is_held(&flow_dissector_mutex)); @@ -179,6 +177,24 @@ int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) return 0; } +int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) +{ + return flow_dissector_bpf_prog_detach(current->nsproxy->net_ns); +} + +static void __net_exit flow_dissector_pernet_pre_exit(struct net *net) +{ + /* We're not racing with attach/detach because there are no + * references to netns left when pre_exit gets called. 
+ */ + if (rcu_access_pointer(net->flow_dissector_prog)) + flow_dissector_bpf_prog_detach(net); +} + +static struct pernet_operations flow_dissector_pernet_ops __net_initdata = { + .pre_exit = flow_dissector_pernet_pre_exit, +}; + /** * __skb_flow_get_ports - extract the upper layer ports and return them * @skb: sk_buff to extract the ports from @@ -1836,7 +1852,7 @@ static int __init init_default_flow_dissectors(void) skb_flow_dissector_init(&flow_keys_basic_dissector, flow_keys_basic_dissector_keys, ARRAY_SIZE(flow_keys_basic_dissector_keys)); - return 0; -} + return register_pernet_subsys(&flow_dissector_pernet_ops); +} core_initcall(init_default_flow_dissectors); -- cgit v1.2.3 From bd6972226f50910a5b97e6b9d443c5d0433bf054 Mon Sep 17 00:00:00 2001 From: Todd Malsbary Date: Thu, 21 May 2020 19:10:49 -0700 Subject: mptcp: use untruncated hash in ADD_ADDR HMAC There is some ambiguity in the RFC as to whether the ADD_ADDR HMAC is the rightmost 64 bits of the entire hash or of the leftmost 160 bits of the hash. The intention, as clarified with the author of the RFC, is the entire hash. This change returns the entire hash from mptcp_crypto_hmac_sha (instead of only the first 160 bits), and moves any truncation/selection operation on the hash to the caller. Fixes: 12555a2d97e5 ("mptcp: use rightmost 64 bits in ADD_ADDR HMAC") Reviewed-by: Christoph Paasch Reviewed-by: Mat Martineau Signed-off-by: Todd Malsbary Signed-off-by: David S. Miller --- net/mptcp/crypto.c | 24 +++++++++--------------- net/mptcp/options.c | 9 +++++---- net/mptcp/protocol.h | 1 - net/mptcp/subflow.c | 15 ++++++++++----- 4 files changed, 24 insertions(+), 25 deletions(-) diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c index c151628bd416..0f5a414a9366 100644 --- a/net/mptcp/crypto.c +++ b/net/mptcp/crypto.c @@ -47,8 +47,6 @@ void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn) void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac) { u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE]; - __be32 mptcp_hashed_key[SHA256_DIGEST_WORDS]; - __be32 *hash_out = (__force __be32 *)hmac; struct sha256_state state; u8 key1be[8]; u8 key2be[8]; @@ -86,11 +84,7 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac) sha256_init(&state); sha256_update(&state, input, SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE); - sha256_final(&state, (u8 *)mptcp_hashed_key); - - /* takes only first 160 bits */ - for (i = 0; i < 5; i++) - hash_out[i] = mptcp_hashed_key[i]; + sha256_final(&state, (u8 *)hmac); } #ifdef CONFIG_MPTCP_HMAC_TEST @@ -101,29 +95,29 @@ struct test_cast { }; /* we can't reuse RFC 4231 test vectors, as we have constraint on the - * input and key size, and we truncate the output. + * input and key size. 
*/ static struct test_cast tests[] = { { .key = "0b0b0b0b0b0b0b0b", .msg = "48692054", - .result = "8385e24fb4235ac37556b6b886db106284a1da67", + .result = "8385e24fb4235ac37556b6b886db106284a1da671699f46db1f235ec622dcafa", }, { .key = "aaaaaaaaaaaaaaaa", .msg = "dddddddd", - .result = "2c5e219164ff1dca1c4a92318d847bb6b9d44492", + .result = "2c5e219164ff1dca1c4a92318d847bb6b9d44492984e1eb71aff9022f71046e9", }, { .key = "0102030405060708", .msg = "cdcdcdcd", - .result = "e73b9ba9969969cefb04aa0d6df18ec2fcc075b6", + .result = "e73b9ba9969969cefb04aa0d6df18ec2fcc075b6f23b4d8c4da736a5dbbc6e7d", }, }; static int __init test_mptcp_crypto(void) { - char hmac[20], hmac_hex[41]; + char hmac[32], hmac_hex[65]; u32 nonce1, nonce2; u64 key1, key2; u8 msg[8]; @@ -140,11 +134,11 @@ static int __init test_mptcp_crypto(void) put_unaligned_be32(nonce2, &msg[4]); mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); - for (j = 0; j < 20; ++j) + for (j = 0; j < 32; ++j) sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff); - hmac_hex[40] = 0; + hmac_hex[64] = 0; - if (memcmp(hmac_hex, tests[i].result, 40)) + if (memcmp(hmac_hex, tests[i].result, 64)) pr_err("test %d failed, got %s expected %s", i, hmac_hex, tests[i].result); else diff --git a/net/mptcp/options.c b/net/mptcp/options.c index b88fae233a62..7793b6011fa7 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -7,6 +7,7 @@ #define pr_fmt(fmt) "MPTCP: " fmt #include +#include #include #include #include "protocol.h" @@ -535,7 +536,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id, struct in_addr *addr) { - u8 hmac[MPTCP_ADDR_HMAC_LEN]; + u8 hmac[SHA256_DIGEST_SIZE]; u8 msg[7]; msg[0] = addr_id; @@ -545,14 +546,14 @@ static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id, mptcp_crypto_hmac_sha(key1, key2, msg, 7, hmac); - return get_unaligned_be64(&hmac[MPTCP_ADDR_HMAC_LEN - sizeof(u64)]); + return get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]); } #if IS_ENABLED(CONFIG_MPTCP_IPV6) static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id, struct in6_addr *addr) { - u8 hmac[MPTCP_ADDR_HMAC_LEN]; + u8 hmac[SHA256_DIGEST_SIZE]; u8 msg[19]; msg[0] = addr_id; @@ -562,7 +563,7 @@ static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id, mptcp_crypto_hmac_sha(key1, key2, msg, 19, hmac); - return get_unaligned_be64(&hmac[MPTCP_ADDR_HMAC_LEN - sizeof(u64)]); + return get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]); } #endif diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index e4ca6320ce76..d0803dfb8108 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -81,7 +81,6 @@ /* MPTCP ADD_ADDR flags */ #define MPTCP_ADDR_ECHO BIT(0) -#define MPTCP_ADDR_HMAC_LEN 20 #define MPTCP_ADDR_IPVERSION_4 4 #define MPTCP_ADDR_IPVERSION_6 6 diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 4931a29a6f08..8968b2c065e7 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -89,7 +90,7 @@ static bool subflow_token_join_request(struct request_sock *req, const struct sk_buff *skb) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); - u8 hmac[MPTCPOPT_HMAC_LEN]; + u8 hmac[SHA256_DIGEST_SIZE]; struct mptcp_sock *msk; int local_id; @@ -201,7 +202,7 @@ static void subflow_v6_init_req(struct request_sock *req, /* validate received truncated hmac and create hmac for third ACK */ static bool 
subflow_thmac_valid(struct mptcp_subflow_context *subflow) { - u8 hmac[MPTCPOPT_HMAC_LEN]; + u8 hmac[SHA256_DIGEST_SIZE]; u64 thmac; subflow_generate_hmac(subflow->remote_key, subflow->local_key, @@ -267,6 +268,8 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->ssn_offset = TCP_SKB_CB(skb)->seq; } } else if (subflow->mp_join) { + u8 hmac[SHA256_DIGEST_SIZE]; + pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow, subflow->thmac, subflow->remote_nonce); @@ -279,7 +282,9 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow_generate_hmac(subflow->local_key, subflow->remote_key, subflow->local_nonce, subflow->remote_nonce, - subflow->hmac); + hmac); + + memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN); if (skb) subflow->ssn_offset = TCP_SKB_CB(skb)->seq; @@ -347,7 +352,7 @@ static bool subflow_hmac_valid(const struct request_sock *req, const struct mptcp_options_received *mp_opt) { const struct mptcp_subflow_request_sock *subflow_req; - u8 hmac[MPTCPOPT_HMAC_LEN]; + u8 hmac[SHA256_DIGEST_SIZE]; struct mptcp_sock *msk; bool ret; @@ -361,7 +366,7 @@ static bool subflow_hmac_valid(const struct request_sock *req, subflow_req->local_nonce, hmac); ret = true; - if (crypto_memneq(hmac, mp_opt->hmac, sizeof(hmac))) + if (crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN)) ret = false; sock_put((struct sock *)msk); -- cgit v1.2.3 From b4024c9e5c57902155d3b5e7de482e245f492bff Mon Sep 17 00:00:00 2001 From: Claudiu Manoil Date: Fri, 22 May 2020 11:54:34 +0300 Subject: felix: Fix initialization of ioremap resources The caller of devm_ioremap_resource(), either accidentally or by wrong assumption, is writing back derived resource data to global static resource initialization tables that should have been constant. Meaning that after it computes the final physical start address it saves the address for no reason in the static tables. This doesn't affect the first driver probing after reboot, but it breaks consecutive driver reloads (i.e. driver unbind & bind) because the initialization tables no longer have the correct initial values. So the next probe() will map the device registers to wrong physical addresses, causing ARM SError async exceptions. This patch fixes all of the above. Fixes: 56051948773e ("net: dsa: ocelot: add driver for Felix switch family") Signed-off-by: Claudiu Manoil Reviewed-by: Vladimir Oltean Tested-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- drivers/net/dsa/ocelot/felix.c | 23 +++++++++++------------ drivers/net/dsa/ocelot/felix.h | 6 +++--- drivers/net/dsa/ocelot/felix_vsc9959.c | 22 ++++++++++------------ 3 files changed, 24 insertions(+), 27 deletions(-) diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index e2c6bf0e430e..e8aae64db1ca 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -388,6 +388,7 @@ static int felix_init_structs(struct felix *felix, int num_phys_ports) struct ocelot *ocelot = &felix->ocelot; phy_interface_t *port_phy_modes; resource_size_t switch_base; + struct resource res; int port, i, err; ocelot->num_phys_ports = num_phys_ports; @@ -422,17 +423,16 @@ static int felix_init_structs(struct felix *felix, int num_phys_ports) for (i = 0; i < TARGET_MAX; i++) { struct regmap *target; - struct resource *res; if (!felix->info->target_io_res[i].name) continue; - res = &felix->info->target_io_res[i]; - res->flags = IORESOURCE_MEM; - res->start += switch_base; - res->end += switch_base; + memcpy(&res, &felix->info->target_io_res[i], sizeof(res)); + res.flags = IORESOURCE_MEM; + res.start += switch_base; + res.end += switch_base; - target = ocelot_regmap_init(ocelot, res); + target = ocelot_regmap_init(ocelot, &res); if (IS_ERR(target)) { dev_err(ocelot->dev, "Failed to map device memory space\n"); @@ -453,7 +453,6 @@ static int felix_init_structs(struct felix *felix, int num_phys_ports) for (port = 0; port < num_phys_ports; port++) { struct ocelot_port *ocelot_port; void __iomem *port_regs; - struct resource *res; ocelot_port = devm_kzalloc(ocelot->dev, sizeof(struct ocelot_port), @@ -465,12 +464,12 @@ static int felix_init_structs(struct felix *felix, int num_phys_ports) return -ENOMEM; } - res = &felix->info->port_io_res[port]; - res->flags = IORESOURCE_MEM; - res->start += switch_base; - res->end += switch_base; + memcpy(&res, &felix->info->port_io_res[port], sizeof(res)); + res.flags = IORESOURCE_MEM; + res.start += switch_base; + res.end += switch_base; - port_regs = devm_ioremap_resource(ocelot->dev, res); + port_regs = devm_ioremap_resource(ocelot->dev, &res); if (IS_ERR(port_regs)) { dev_err(ocelot->dev, "failed to map registers for port %d\n", port); diff --git a/drivers/net/dsa/ocelot/felix.h b/drivers/net/dsa/ocelot/felix.h index 9af106513e53..730a8a90e1f7 100644 --- a/drivers/net/dsa/ocelot/felix.h +++ b/drivers/net/dsa/ocelot/felix.h @@ -8,9 +8,9 @@ /* Platform-specific information */ struct felix_info { - struct resource *target_io_res; - struct resource *port_io_res; - struct resource *imdio_res; + const struct resource *target_io_res; + const struct resource *port_io_res; + const struct resource *imdio_res; const struct reg_field *regfields; const u32 *const *map; const struct ocelot_ops *ops; diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c index 8bf395f12b47..5211f05ef2fb 100644 --- a/drivers/net/dsa/ocelot/felix_vsc9959.c +++ b/drivers/net/dsa/ocelot/felix_vsc9959.c @@ -333,10 +333,8 @@ static const u32 *vsc9959_regmap[] = { [GCB] = vsc9959_gcb_regmap, }; -/* Addresses are relative to the PCI device's base address and - * will be fixed up at ioremap time. 
- */ -static struct resource vsc9959_target_io_res[] = { +/* Addresses are relative to the PCI device's base address */ +static const struct resource vsc9959_target_io_res[] = { [ANA] = { .start = 0x0280000, .end = 0x028ffff, @@ -379,7 +377,7 @@ static struct resource vsc9959_target_io_res[] = { }, }; -static struct resource vsc9959_port_io_res[] = { +static const struct resource vsc9959_port_io_res[] = { { .start = 0x0100000, .end = 0x010ffff, @@ -415,7 +413,7 @@ static struct resource vsc9959_port_io_res[] = { /* Port MAC 0 Internal MDIO bus through which the SerDes acting as an * SGMII/QSGMII MAC PCS can be found. */ -static struct resource vsc9959_imdio_res = { +static const struct resource vsc9959_imdio_res = { .start = 0x8030, .end = 0x8040, .name = "imdio", @@ -1111,7 +1109,7 @@ static int vsc9959_mdio_bus_alloc(struct ocelot *ocelot) struct device *dev = ocelot->dev; resource_size_t imdio_base; void __iomem *imdio_regs; - struct resource *res; + struct resource res; struct enetc_hw *hw; struct mii_bus *bus; int port; @@ -1128,12 +1126,12 @@ static int vsc9959_mdio_bus_alloc(struct ocelot *ocelot) imdio_base = pci_resource_start(felix->pdev, felix->info->imdio_pci_bar); - res = felix->info->imdio_res; - res->flags = IORESOURCE_MEM; - res->start += imdio_base; - res->end += imdio_base; + memcpy(&res, felix->info->imdio_res, sizeof(res)); + res.flags = IORESOURCE_MEM; + res.start += imdio_base; + res.end += imdio_base; - imdio_regs = devm_ioremap_resource(dev, res); + imdio_regs = devm_ioremap_resource(dev, &res); if (IS_ERR(imdio_regs)) { dev_err(dev, "failed to map internal MDIO registers\n"); return PTR_ERR(imdio_regs); -- cgit v1.2.3 From 3138a07ce219acde4c0d7ea0b6d54ba64153328b Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 20 May 2020 12:26:35 +0100 Subject: net: mvpp2: fix RX hashing for non-10G ports When rxhash is enabled on any ethernet port except the first in each CP block, traffic flow is prevented. The analysis is below: I've been investigating this afternoon, and what I've found, comparing a kernel without 895586d5dc32 and with 895586d5dc32 applied is: - The table programmed into the hardware via mvpp22_rss_fill_table() appears to be identical with or without the commit. - When rxhash is enabled on eth2, mvpp2_rss_port_c2_enable() reports that c2.attr[0] and c2.attr[2] are written back containing: - with 895586d5dc32, failing: 00200000 40000000 - without 895586d5dc32, working: 04000000 40000000 - When disabling rxhash, c2.attr[0] and c2.attr[2] are written back as: 04000000 00000000 The second value represents the MVPP22_CLS_C2_ATTR2_RSS_EN bit, the first value is the queue number, which comprises two fields. The high 5 bits are 24:29 and the low three are 21:23 inclusive. This comes from: c2.attr[0] = MVPP22_CLS_C2_ATTR0_QHIGH(qh) | MVPP22_CLS_C2_ATTR0_QLOW(ql); So, the working case gives eth2 a queue id of 4.0, or 32 as per port->first_rxq, and the non-working case a queue id of 0.1, or 1. The allocation of queue IDs seems to be in mvpp2_port_probe(): if (priv->hw_version == MVPP21) port->first_rxq = port->id * port->nrxqs; else port->first_rxq = port->id * priv->max_port_rxqs; Where: if (priv->hw_version == MVPP21) priv->max_port_rxqs = 8; else priv->max_port_rxqs = 32; Making the port 0 (eth0 / eth1) have port->first_rxq = 0, and port 1 (eth2) be 32. It seems the idea is that the first 32 queues belong to port 0, the second 32 queues belong to port 1, etc. 
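A quick way to sanity-check the queue numbering described above is to reproduce the arithmetic in a few lines of standalone C; the constants are taken from the description in this commit message, not from the driver headers:

```
#include <stdio.h>

#define MAX_PORT_RXQS	32	/* per the description for this SoC */
#define QLOW_BITS	3	/* queue-low = 3 bits, queue-high = 5 bits */

int main(void)
{
	for (int port = 0; port < 3; port++) {
		unsigned int first_rxq = port * MAX_PORT_RXQS;
		unsigned int qh = first_rxq >> QLOW_BITS;
		unsigned int ql = first_rxq & ((1 << QLOW_BITS) - 1);

		printf("port %d: first_rxq=%u -> queue-high=%u queue-low=%u\n",
		       port, first_rxq, qh, ql);
	}
	return 0;
}
```

Port 1 (eth2) comes out as first_rxq = 32, i.e. queue-high.queue-low = 4.0, matching the working value observed in c2.attr[0] above.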
mvpp2_rss_port_c2_enable() gets the queue number from it's parameter, 'ctx', which comes from mvpp22_rss_ctx(port, 0). This returns port->rss_ctx[0]. mvpp22_rss_context_create() is responsible for allocating that, which it does by looking for an unallocated priv->rss_tables[] pointer. This table is shared amongst all ports on the CP silicon. When we write the tables in mvpp22_rss_fill_table(), the RSS table entry is defined by: u32 sel = MVPP22_RSS_INDEX_TABLE(rss_ctx) | MVPP22_RSS_INDEX_TABLE_ENTRY(i); where rss_ctx is the context ID (queue number) and i is the index in the table. If we look at what is written: - The first table to be written has "sel" values of 00000000..0000001f, containing values 0..3. This appears to be for eth1. This is table 0, RX queue number 0. - The second table has "sel" values of 00000100..0000011f, and appears to be for eth2. These contain values 0x20..0x23. This is table 1, RX queue number 0. - The third table has "sel" values of 00000200..0000021f, and appears to be for eth3. These contain values 0x40..0x43. This is table 2, RX queue number 0. How do queue numbers translate to the RSS table? There is another table - the RXQ2RSS table, indexed by the MVPP22_RSS_INDEX_QUEUE field of MVPP22_RSS_INDEX and accessed through the MVPP22_RXQ2RSS_TABLE register. Before 895586d5dc32, it was: mvpp2_write(priv, MVPP22_RSS_INDEX, MVPP22_RSS_INDEX_QUEUE(port->first_rxq)); mvpp2_write(priv, MVPP22_RXQ2RSS_TABLE, MVPP22_RSS_TABLE_POINTER(port->id)); and after: mvpp2_write(priv, MVPP22_RSS_INDEX, MVPP22_RSS_INDEX_QUEUE(ctx)); mvpp2_write(priv, MVPP22_RXQ2RSS_TABLE, MVPP22_RSS_TABLE_POINTER(ctx)); Before the commit, for eth2, that would've contained '32' for the index and '1' for the table pointer - mapping queue 32 to table 1. Remember that this is queue-high.queue-low of 4.0. After the commit, we appear to map queue 1 to table 1. That again looks fine on the face of it. Section 9.3.1 of the A8040 manual seems indicate the reason that the queue number is separated. queue-low seems to always come from the classifier, whereas queue-high can be from the ingress physical port number or the classifier depending on the MVPP2_CLS_SWFWD_PCTRL_REG. We set the port bit in MVPP2_CLS_SWFWD_PCTRL_REG, meaning that queue-high comes from the MVPP2_CLS_SWFWD_P2HQ_REG() register... and this seems to be where our bug comes from. mvpp2_cls_oversize_rxq_set() sets this up as: mvpp2_write(port->priv, MVPP2_CLS_SWFWD_P2HQ_REG(port->id), (port->first_rxq >> MVPP2_CLS_OVERSIZE_RXQ_LOW_BITS)); val = mvpp2_read(port->priv, MVPP2_CLS_SWFWD_PCTRL_REG); val |= MVPP2_CLS_SWFWD_PCTRL_MASK(port->id); mvpp2_write(port->priv, MVPP2_CLS_SWFWD_PCTRL_REG, val); Setting the MVPP2_CLS_SWFWD_PCTRL_MASK bit means that the queue-high for eth2 is _always_ 4, so only queues 32 through 39 inclusive are available to eth2. Yet, we're trying to tell the classifier to set queue-high, which will be ignored, to zero. Hence, the queue-high field (MVPP22_CLS_C2_ATTR0_QHIGH()) from the classifier will be ignored. This means we end up directing traffic from eth2 not to queue 1, but to queue 33, and then we tell it to look up queue 33 in the RSS table. However, RSS table has not been programmed for queue 33, and so it ends up (presumably) dropping the packets. It seems that mvpp22_rss_context_create() doesn't take account of the fact that the upper 5 bits of the queue ID can't actually be changed due to the settings in mvpp2_cls_oversize_rxq_set(), _or_ it seems that mvpp2_cls_oversize_rxq_set() has been missed in this commit. 
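To make the disagreement concrete, here is a toy model of the two lookups just described: queue-high forced from the per-port register versus queue-low taken from the classifier, with the RXQ2RSS table reduced to a plain array. It is purely illustrative and assumes the 5+3 bit split from this analysis:

```
#include <stdio.h>

#define QLOW_BITS 3

int main(void)
{
	/* RXQ2RSS: global rxq -> RSS table pointer; -1 models "never
	 * programmed by the driver". */
	int rxq2rss[64];
	for (int i = 0; i < 64; i++)
		rxq2rss[i] = -1;
	rxq2rss[1] = 1;			/* what the new code programs for eth2 */

	/* Classifier result for eth2 after the commit: qh=0, ql=1. */
	unsigned int cls_qh = 0, cls_ql = 1;

	/* With the SWFWD_PCTRL mask bit set, queue-high is forced from the
	 * per-port P2HQ value instead of the classifier. */
	unsigned int forced_qh = 32 >> QLOW_BITS;	/* eth2: first_rxq = 32 */
	unsigned int rxq = (forced_qh << QLOW_BITS) | cls_ql;

	printf("classifier asked for rxq %u, hardware delivers to rxq %u\n",
	       (cls_qh << QLOW_BITS) | cls_ql, rxq);
	printf("RXQ2RSS[%u] = %d (%s)\n", rxq, rxq2rss[rxq],
	       rxq2rss[rxq] < 0 ? "not programmed -> drop" : "ok");
	return 0;
}
```

The classifier asks for queue 1, the hardware actually delivers to queue 33, and queue 33 was never pointed at an RSS table, which is exactly the drop being debugged here.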
Either way, these two functions mutually disagree with what queue number should be used. Looking deeper into what mvpp2_cls_oversize_rxq_set() and the MTU validation is doing, it seems that MVPP2_CLS_SWFWD_P2HQ_REG() is used for over-sized packets attempting to egress through this port. With the classifier having had RSS enabled and directing eth2 traffic to queue 1, we may still have packets appearing on queue 32 for this port. However, the only way we may end up with over-sized packets attempting to egress through eth2 - is if the A8040 forwards frames between its ports. From what I can see, we don't support that feature, and the kernel restricts the egress packet size to the MTU. In any case, if we were to attempt to transmit an oversized packet, we have no support in the kernel to deal with that appearing in the port's receive queue. So, this patch attempts to solve the issue by clearing the MVPP2_CLS_SWFWD_PCTRL_MASK() bit, allowing MVPP22_CLS_C2_ATTR0_QHIGH() from the classifier to define the queue-high field of the queue number. My testing seems to confirm my findings above - clearing this bit means that if I enable rxhash on eth2, the interface can then pass traffic, as we are now directing traffic to RX queue 1 rather than queue 33. Traffic still seems to work with rxhash off as well. Reported-by: Matteo Croce Tested-by: Matteo Croce Fixes: 895586d5dc32 ("net: mvpp2: cls: Use RSS contexts to handle RSS tables") Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c index 7352244c5e68..d4a4e241333d 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c @@ -1070,7 +1070,7 @@ void mvpp2_cls_oversize_rxq_set(struct mvpp2_port *port) (port->first_rxq >> MVPP2_CLS_OVERSIZE_RXQ_LOW_BITS)); val = mvpp2_read(port->priv, MVPP2_CLS_SWFWD_PCTRL_REG); - val |= MVPP2_CLS_SWFWD_PCTRL_MASK(port->id); + val &= ~MVPP2_CLS_SWFWD_PCTRL_MASK(port->id); mvpp2_write(port->priv, MVPP2_CLS_SWFWD_PCTRL_REG, val); } -- cgit v1.2.3 From 1378817486d6860f6a927f573491afe65287abf1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 May 2020 11:29:58 -0700 Subject: tipc: block BH before using dst_cache dst_cache_get() documents it must be used with BH disabled. 
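Before the splat quoted below, it is worth recalling why dst_cache_get() has that requirement: the cache is per-CPU, so the caller must not be able to migrate to another CPU between looking up "its" slot and using it, and running with BH disabled guarantees that. As a loose userspace analogue only (not the kernel API), CPU affinity stands in for local_bh_disable():

```
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

/* Userspace analogue of a per-CPU cache: one slot per possible CPU.
 * Using "the current CPU's" slot is only safe if the task cannot move
 * to another CPU while it does so. */
static int per_cpu_cache[1024];

int main(void)
{
	cpu_set_t set;
	int cpu;

	/* Stand-in for local_bh_disable(): pin ourselves so the CPU id
	 * cannot change while we touch the per-CPU slot. */
	CPU_ZERO(&set);
	CPU_SET(0, &set);
	if (sched_setaffinity(0, sizeof(set), &set))
		perror("sched_setaffinity");

	cpu = sched_getcpu();
	if (cpu < 0)
		return 1;
	per_cpu_cache[cpu]++;
	printf("updated per-CPU slot %d -> %d\n", cpu, per_cpu_cache[cpu]);
	return 0;
}
```

The debug check in the splat below is the kernel catching exactly the unpinned case: smp_processor_id() used while the caller could still be migrated.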
sysbot reported : BUG: using smp_processor_id() in preemptible [00000000] code: /21697 caller is dst_cache_get+0x3a/0xb0 net/core/dst_cache.c:68 CPU: 0 PID: 21697 Comm: Not tainted 5.7.0-rc6-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x188/0x20d lib/dump_stack.c:118 check_preemption_disabled lib/smp_processor_id.c:47 [inline] debug_smp_processor_id.cold+0x88/0x9b lib/smp_processor_id.c:57 dst_cache_get+0x3a/0xb0 net/core/dst_cache.c:68 tipc_udp_xmit.isra.0+0xb9/0xad0 net/tipc/udp_media.c:164 tipc_udp_send_msg+0x3e6/0x490 net/tipc/udp_media.c:244 tipc_bearer_xmit_skb+0x1de/0x3f0 net/tipc/bearer.c:526 tipc_enable_bearer+0xb2f/0xd60 net/tipc/bearer.c:331 __tipc_nl_bearer_enable+0x2bf/0x390 net/tipc/bearer.c:995 tipc_nl_bearer_enable+0x1e/0x30 net/tipc/bearer.c:1003 genl_family_rcv_msg_doit net/netlink/genetlink.c:673 [inline] genl_family_rcv_msg net/netlink/genetlink.c:718 [inline] genl_rcv_msg+0x627/0xdf0 net/netlink/genetlink.c:735 netlink_rcv_skb+0x15a/0x410 net/netlink/af_netlink.c:2469 genl_rcv+0x24/0x40 net/netlink/genetlink.c:746 netlink_unicast_kernel net/netlink/af_netlink.c:1303 [inline] netlink_unicast+0x537/0x740 net/netlink/af_netlink.c:1329 netlink_sendmsg+0x882/0xe10 net/netlink/af_netlink.c:1918 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:672 ____sys_sendmsg+0x6bf/0x7e0 net/socket.c:2362 ___sys_sendmsg+0x100/0x170 net/socket.c:2416 __sys_sendmsg+0xec/0x1b0 net/socket.c:2449 do_syscall_64+0xf6/0x7d0 arch/x86/entry/common.c:295 entry_SYSCALL_64_after_hwframe+0x49/0xb3 RIP: 0033:0x45ca29 Fixes: e9c1a793210f ("tipc: add dst_cache support for udp media") Cc: Xin Long Cc: Jon Maloy Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/tipc/udp_media.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index d6620ad53546..28a283f26a8d 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -161,9 +161,11 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, struct udp_bearer *ub, struct udp_media_addr *src, struct udp_media_addr *dst, struct dst_cache *cache) { - struct dst_entry *ndst = dst_cache_get(cache); + struct dst_entry *ndst; int ttl, err = 0; + local_bh_disable(); + ndst = dst_cache_get(cache); if (dst->proto == htons(ETH_P_IP)) { struct rtable *rt = (struct rtable *)ndst; @@ -210,9 +212,11 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, src->port, dst->port, false); #endif } + local_bh_enable(); return err; tx_error: + local_bh_enable(); kfree_skb(skb); return err; } -- cgit v1.2.3 From d3e8e4c11870413789f029a71e72ae6e971fe678 Mon Sep 17 00:00:00 2001 From: Jere Leppänen Date: Wed, 20 May 2020 18:15:31 +0300 Subject: sctp: Start shutdown on association restart if in SHUTDOWN-SENT state and socket is closed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit bdf6fa52f01b ("sctp: handle association restarts when the socket is closed.") starts shutdown when an association is restarted, if in SHUTDOWN-PENDING state and the socket is closed. However, the rationale stated in that commit applies also when in SHUTDOWN-SENT state - we don't want to move an association to ESTABLISHED state when the socket has been closed, because that results in an association that is unreachable from user space. The problem scenario: 1. 
Client crashes and/or restarts. 2. Server (using one-to-one socket) calls close(). SHUTDOWN is lost. 3. Client reconnects using the same addresses and ports. 4. Server's association is restarted. The association and the socket move to ESTABLISHED state, even though the server process has closed its descriptor. Also, after step 4 when the server process exits, some resources are leaked in an attempt to release the underlying inet sock structure in ESTABLISHED state: IPv4: Attempt to release TCP socket in state 1 00000000377288c7 Fix by acting the same way as in SHUTDOWN-PENDING state. That is, if an association is restarted in SHUTDOWN-SENT state and the socket is closed, then start shutdown and don't move the association or the socket to ESTABLISHED state. Fixes: bdf6fa52f01b ("sctp: handle association restarts when the socket is closed.") Signed-off-by: Jere Leppänen Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/sm_statefuns.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 26788f4a3b9e..e86620fbd90f 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1856,12 +1856,13 @@ static enum sctp_disposition sctp_sf_do_dupcook_a( /* Update the content of current association. */ sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_ASSOC, SCTP_ASOC(new_asoc)); sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev)); - if (sctp_state(asoc, SHUTDOWN_PENDING) && + if ((sctp_state(asoc, SHUTDOWN_PENDING) || + sctp_state(asoc, SHUTDOWN_SENT)) && (sctp_sstate(asoc->base.sk, CLOSING) || sock_flag(asoc->base.sk, SOCK_DEAD))) { - /* if were currently in SHUTDOWN_PENDING, but the socket - * has been closed by user, don't transition to ESTABLISHED. - * Instead trigger SHUTDOWN bundled with COOKIE_ACK. + /* If the socket has been closed by user, don't + * transition to ESTABLISHED. Instead trigger SHUTDOWN + * bundled with COOKIE_ACK. */ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); return sctp_sf_do_9_2_start_shutdown(net, ep, asoc, -- cgit v1.2.3 From 79dde73cf9bcf1dd317a2667f78b758e9fe139ed Mon Sep 17 00:00:00 2001 From: Valentin Longchamp Date: Wed, 20 May 2020 17:53:50 +0200 Subject: net/ethernet/freescale: rework quiesce/activate for ucc_geth ugeth_quiesce/activate are used to halt the controller when there is a link change that requires reconfiguring the MAC. The previous implementation called netif_device_detach(). This however causes the initial activation of the netdevice to fail precisely because it's detached. For details, see [1]. A possible workaround was the revert of commit "net: linkwatch: add check for netdevice being present to linkwatch_do_dev". However, the check introduced in the above commit is correct and shall be kept. The netif_device_detach() is thus replaced with netif_tx_stop_all_queues(), which prevents any transmission. This allows the MAC config change required by the link change to be performed without detaching the corresponding netdevice, and thus no longer prevents its initial activation. [1] https://lists.openwall.net/netdev/2020/01/08/201 Signed-off-by: Valentin Longchamp Acked-by: Matteo Ghidoni Signed-off-by: David S.
Miller --- drivers/net/ethernet/freescale/ucc_geth.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index 6e5f6dd169b5..552e7554a9f8 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -42,6 +42,7 @@ #include #include #include +#include #include "ucc_geth.h" @@ -1548,11 +1549,8 @@ static int ugeth_disable(struct ucc_geth_private *ugeth, enum comm_dir mode) static void ugeth_quiesce(struct ucc_geth_private *ugeth) { - /* Prevent any further xmits, plus detach the device. */ - netif_device_detach(ugeth->ndev); - - /* Wait for any current xmits to finish. */ - netif_tx_disable(ugeth->ndev); + /* Prevent any further xmits */ + netif_tx_stop_all_queues(ugeth->ndev); /* Disable the interrupt to avoid NAPI rescheduling. */ disable_irq(ugeth->ug_info->uf_info.irq); @@ -1565,7 +1563,10 @@ static void ugeth_activate(struct ucc_geth_private *ugeth) { napi_enable(&ugeth->napi); enable_irq(ugeth->ug_info->uf_info.irq); - netif_device_attach(ugeth->ndev); + + /* allow to xmit again */ + netif_tx_wake_all_queues(ugeth->ndev); + __netdev_watchdog_up(ugeth->ndev); } /* Called every time the controller might need to be made -- cgit v1.2.3 From be43224fc0e4697ad0d03107cbaf1ecf26e7ee72 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 21 May 2020 14:46:16 +0300 Subject: netdevsim: Ensure policer drop counter always increases In case the policer drop counter is retrieved when the jiffies value is a multiple of 64, the counter will not be incremented. This randomly breaks a selftest [1] the reads the counter twice and checks that it was incremented: ``` TEST: Trap policer [FAIL] Policer drop counter was not incremented ``` Fix by always incrementing the counter by 1. [1] tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh Fixes: ad188458d012 ("netdevsim: Add devlink-trap policer support") Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/netdevsim/dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c index 68668a22b9dd..dc3ff0e20944 100644 --- a/drivers/net/netdevsim/dev.c +++ b/drivers/net/netdevsim/dev.c @@ -858,8 +858,7 @@ nsim_dev_devlink_trap_policer_counter_get(struct devlink *devlink, return -EINVAL; cnt = &nsim_dev->trap_data->trap_policers_cnt_arr[policer->id - 1]; - *p_drops = *cnt; - *cnt += jiffies % 64; + *p_drops = (*cnt)++; return 0; } -- cgit v1.2.3 From 4d59e59cf45046fe1263a935f8abc418bb61215c Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 21 May 2020 14:46:17 +0300 Subject: selftests: netdevsim: Always initialize 'RET' variable The variable is used by log_test() to check if the test case completely successfully or not. In case it is not initialized at the start of a test case, it is possible for the test case to fail despite not encountering any errors. Example: ``` ... TEST: Trap group statistics [ OK ] TEST: Trap policer [FAIL] Policer drop counter was not incremented TEST: Trap policer binding [FAIL] Policer drop counter was not incremented ``` Failure of trap_policer_test() caused trap_policer_bind_test() to fail as well. Fix by adding missing initialization of the variable. Fixes: 5fbff58e27a1 ("selftests: netdevsim: Add test cases for devlink-trap policers") Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. 
Miller --- tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh index dbd1e014ba17..da49ad2761b5 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh @@ -264,6 +264,8 @@ trap_policer_test() local packets_t0 local packets_t1 + RET=0 + if [ $(devlink_trap_policers_num_get) -eq 0 ]; then check_err 1 "Failed to dump policers" fi @@ -328,6 +330,8 @@ trap_group_check_policer() trap_policer_bind_test() { + RET=0 + devlink trap group set $DEVLINK_DEV group l2_drops policer 1 check_err $? "Failed to bind a valid policer" if [ $(devlink_trap_group_policer_get "l2_drops") -ne 1 ]; then -- cgit v1.2.3 From a96ac8a0045e3cbe3e5af6d1b3c78c6c2065dec5 Mon Sep 17 00:00:00 2001 From: Jonathan McDowell Date: Thu, 21 May 2020 12:49:34 +0100 Subject: net: ethernet: stmmac: Enable interface clocks on probe for IPQ806x The ipq806x_gmac_probe() function enables the PTP clock but not the appropriate interface clocks. This means that if the bootloader hasn't done so attempting to bring up the interface will fail with an error like: [ 59.028131] ipq806x-gmac-dwmac 37600000.ethernet: Failed to reset the dma [ 59.028196] ipq806x-gmac-dwmac 37600000.ethernet eth1: stmmac_hw_setup: DMA engine initialization failed [ 59.034056] ipq806x-gmac-dwmac 37600000.ethernet eth1: stmmac_open: Hw setup failed This patch, a slightly cleaned up version of one posted by Sergey Sergeev in: https://forum.openwrt.org/t/support-for-mikrotik-rb3011uias-rm/4064/257 correctly enables the clock; we have already configured the source just before this. Tested on a MikroTik RB3011. Signed-off-by: Jonathan McDowell Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c index 6ae13dc19510..02102c781a8c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c @@ -319,6 +319,19 @@ static int ipq806x_gmac_probe(struct platform_device *pdev) /* Enable PTP clock */ regmap_read(gmac->nss_common, NSS_COMMON_CLK_GATE, &val); val |= NSS_COMMON_CLK_GATE_PTP_EN(gmac->id); + switch (gmac->phy_mode) { + case PHY_INTERFACE_MODE_RGMII: + val |= NSS_COMMON_CLK_GATE_RGMII_RX_EN(gmac->id) | + NSS_COMMON_CLK_GATE_RGMII_TX_EN(gmac->id); + break; + case PHY_INTERFACE_MODE_SGMII: + val |= NSS_COMMON_CLK_GATE_GMII_RX_EN(gmac->id) | + NSS_COMMON_CLK_GATE_GMII_TX_EN(gmac->id); + break; + default: + /* We don't get here; the switch above will have errored out */ + unreachable(); + } regmap_write(gmac->nss_common, NSS_COMMON_CLK_GATE, val); if (gmac->phy_mode == PHY_INTERFACE_MODE_SGMII) { -- cgit v1.2.3 From 4340f42f207eacb81e7a6b6bb1e3b6afad9a2e26 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 21 May 2020 15:11:44 +0300 Subject: mlxsw: spectrum: Fix use-after-free of split/unsplit/type_set in case reload fails In case of reload fail, the mlxsw_sp->ports contains a pointer to a freed memory (either by reload_down() or reload_up() error path). Fix this by initializing the pointer to NULL and checking it before dereferencing in split/unsplit/type_set callpaths. 
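The shape of the mlxsw fix described above is a common one: once a pointer can legitimately be NULL after a failed reload, every externally reachable path must go through a checking accessor rather than indexing the array directly. A minimal, self-contained sketch of that pattern, with made-up names rather than the driver's:

```
#include <stdio.h>

struct dummy_port {
	int local_port;
};

struct dummy_switch {
	struct dummy_port **ports;	/* NULL after a failed reload */
	unsigned int max_ports;
};

/* Checking accessor: callers never dereference ->ports directly. */
static struct dummy_port *
port_get_by_local_port(struct dummy_switch *sw, unsigned int local_port)
{
	if (sw->ports && local_port < sw->max_ports && sw->ports[local_port])
		return sw->ports[local_port];
	return NULL;
}

int main(void)
{
	struct dummy_switch sw = { .ports = NULL, .max_ports = 64 };
	struct dummy_port *port = port_get_by_local_port(&sw, 3);

	if (!port) {
		fprintf(stderr, "port 3 does not exist\n");
		return 1;
	}
	printf("port %d found\n", port->local_port);
	return 0;
}
```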
Fixes: 24cc68ad6c46 ("mlxsw: core: Add support for reload") Reported-by: Danielle Ratson Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 14 ++++++++++++-- drivers/net/ethernet/mellanox/mlxsw/switchx2.c | 8 ++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 24ca8d5bc564..6b39978acd07 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -3986,6 +3986,7 @@ static void mlxsw_sp_ports_remove(struct mlxsw_sp *mlxsw_sp) mlxsw_sp_port_remove(mlxsw_sp, i); mlxsw_sp_cpu_port_remove(mlxsw_sp); kfree(mlxsw_sp->ports); + mlxsw_sp->ports = NULL; } static int mlxsw_sp_ports_create(struct mlxsw_sp *mlxsw_sp) @@ -4022,6 +4023,7 @@ err_port_create: mlxsw_sp_cpu_port_remove(mlxsw_sp); err_cpu_port_create: kfree(mlxsw_sp->ports); + mlxsw_sp->ports = NULL; return err; } @@ -4143,6 +4145,14 @@ static int mlxsw_sp_local_ports_offset(struct mlxsw_core *mlxsw_core, return mlxsw_core_res_get(mlxsw_core, local_ports_in_x_res_id); } +static struct mlxsw_sp_port * +mlxsw_sp_port_get_by_local_port(struct mlxsw_sp *mlxsw_sp, u8 local_port) +{ + if (mlxsw_sp->ports && mlxsw_sp->ports[local_port]) + return mlxsw_sp->ports[local_port]; + return NULL; +} + static int mlxsw_sp_port_split(struct mlxsw_core *mlxsw_core, u8 local_port, unsigned int count, struct netlink_ext_ack *extack) @@ -4156,7 +4166,7 @@ static int mlxsw_sp_port_split(struct mlxsw_core *mlxsw_core, u8 local_port, int i; int err; - mlxsw_sp_port = mlxsw_sp->ports[local_port]; + mlxsw_sp_port = mlxsw_sp_port_get_by_local_port(mlxsw_sp, local_port); if (!mlxsw_sp_port) { dev_err(mlxsw_sp->bus_info->dev, "Port number \"%d\" does not exist\n", local_port); @@ -4251,7 +4261,7 @@ static int mlxsw_sp_port_unsplit(struct mlxsw_core *mlxsw_core, u8 local_port, int offset; int i; - mlxsw_sp_port = mlxsw_sp->ports[local_port]; + mlxsw_sp_port = mlxsw_sp_port_get_by_local_port(mlxsw_sp, local_port); if (!mlxsw_sp_port) { dev_err(mlxsw_sp->bus_info->dev, "Port number \"%d\" does not exist\n", local_port); diff --git a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c index 90535820b559..2503f61db5fb 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c +++ b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c @@ -1259,6 +1259,7 @@ static void mlxsw_sx_ports_remove(struct mlxsw_sx *mlxsw_sx) if (mlxsw_sx_port_created(mlxsw_sx, i)) mlxsw_sx_port_remove(mlxsw_sx, i); kfree(mlxsw_sx->ports); + mlxsw_sx->ports = NULL; } static int mlxsw_sx_ports_create(struct mlxsw_sx *mlxsw_sx) @@ -1293,6 +1294,7 @@ err_port_module_info_get: if (mlxsw_sx_port_created(mlxsw_sx, i)) mlxsw_sx_port_remove(mlxsw_sx, i); kfree(mlxsw_sx->ports); + mlxsw_sx->ports = NULL; return err; } @@ -1376,6 +1378,12 @@ static int mlxsw_sx_port_type_set(struct mlxsw_core *mlxsw_core, u8 local_port, u8 module, width; int err; + if (!mlxsw_sx->ports || !mlxsw_sx->ports[local_port]) { + dev_err(mlxsw_sx->bus_info->dev, "Port number \"%d\" does not exist\n", + local_port); + return -EINVAL; + } + if (new_type == DEVLINK_PORT_TYPE_AUTO) return -EOPNOTSUPP; -- cgit v1.2.3 From 46ca11177ed593f39d534f8d2c74ec5344e90c11 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Thu, 21 May 2020 15:11:45 +0300 Subject: selftests: mlxsw: qos_mc_aware: Specify arping timeout as an integer Starting from iputils 
s20190709 (used in Fedora 31), arping does not support timeout being specified as a decimal: $ arping -c 1 -I swp1 -b 192.0.2.66 -q -w 0.1 arping: invalid argument: '0.1' Previously, such timeouts were rounded to an integer. Fix this by specifying the timeout as an integer. Fixes: a5ee171d087e ("selftests: mlxsw: qos_mc_aware: Add a test for UC awareness") Signed-off-by: Amit Cohen Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh index 24dd8ed48580..b025daea062d 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh @@ -300,7 +300,7 @@ test_uc_aware() local i for ((i = 0; i < attempts; ++i)); do - if $ARPING -c 1 -I $h1 -b 192.0.2.66 -q -w 0.1; then + if $ARPING -c 1 -I $h1 -b 192.0.2.66 -q -w 1; then ((passes++)) fi -- cgit v1.2.3 From 561535b0f23961ced071b82575d5e83e6351a814 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 21 May 2020 22:03:08 +0200 Subject: r8169: fix OCP access on RTL8117 According to r8168 vendor driver DASHv3 chips like RTL8168fp/RTL8117 need a special addressing for OCP access. Fix is compile-tested only due to missing test hardware. Fixes: 1287723aa139 ("r8169: add support for RTL8117") Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 78e15cc00e0a..c51b48dc3639 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -1050,6 +1050,13 @@ static u16 rtl_ephy_read(struct rtl8169_private *tp, int reg_addr) RTL_R32(tp, EPHYAR) & EPHYAR_DATA_MASK : ~0; } +static void r8168fp_adjust_ocp_cmd(struct rtl8169_private *tp, u32 *cmd, int type) +{ + /* based on RTL8168FP_OOBMAC_BASE in vendor driver */ + if (tp->mac_version == RTL_GIGA_MAC_VER_52 && type == ERIAR_OOB) + *cmd |= 0x7f0 << 18; +} + DECLARE_RTL_COND(rtl_eriar_cond) { return RTL_R32(tp, ERIAR) & ERIAR_FLAG; @@ -1058,9 +1065,12 @@ DECLARE_RTL_COND(rtl_eriar_cond) static void _rtl_eri_write(struct rtl8169_private *tp, int addr, u32 mask, u32 val, int type) { + u32 cmd = ERIAR_WRITE_CMD | type | mask | addr; + BUG_ON((addr & 3) || (mask == 0)); RTL_W32(tp, ERIDR, val); - RTL_W32(tp, ERIAR, ERIAR_WRITE_CMD | type | mask | addr); + r8168fp_adjust_ocp_cmd(tp, &cmd, type); + RTL_W32(tp, ERIAR, cmd); rtl_udelay_loop_wait_low(tp, &rtl_eriar_cond, 100, 100); } @@ -1073,7 +1083,10 @@ static void rtl_eri_write(struct rtl8169_private *tp, int addr, u32 mask, static u32 _rtl_eri_read(struct rtl8169_private *tp, int addr, int type) { - RTL_W32(tp, ERIAR, ERIAR_READ_CMD | type | ERIAR_MASK_1111 | addr); + u32 cmd = ERIAR_READ_CMD | type | ERIAR_MASK_1111 | addr; + + r8168fp_adjust_ocp_cmd(tp, &cmd, type); + RTL_W32(tp, ERIAR, cmd); return rtl_udelay_loop_wait_high(tp, &rtl_eriar_cond, 100, 100) ? 
RTL_R32(tp, ERIDR) : ~0; -- cgit v1.2.3 From bf655ba212dfd10d1c86afeee3f3372dbd731d46 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 22 May 2020 00:31:23 +0300 Subject: net: mscc: ocelot: fix address ageing time (again) ocelot_set_ageing_time has 2 callers: - felix_set_ageing_time: from drivers/net/dsa/ocelot/felix.c - ocelot_port_attr_ageing_set: from drivers/net/ethernet/mscc/ocelot.c The issue described in the fixed commit below actually happened for the felix_set_ageing_time code path only, since ocelot_port_attr_ageing_set was already dividing by 1000. So to make both paths symmetrical (and to fix addresses getting aged way too fast on Ocelot), stop dividing by 1000 at caller side altogether. Fixes: c0d7eccbc761 ("net: mscc: ocelot: ANA_AUTOAGE_AGE_PERIOD holds a value in seconds, not ms") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 02350c3d9d01..efb3965a3e42 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1467,7 +1467,7 @@ static void ocelot_port_attr_ageing_set(struct ocelot *ocelot, int port, unsigned long ageing_clock_t) { unsigned long ageing_jiffies = clock_t_to_jiffies(ageing_clock_t); - u32 ageing_time = jiffies_to_msecs(ageing_jiffies) / 1000; + u32 ageing_time = jiffies_to_msecs(ageing_jiffies); ocelot_set_ageing_time(ocelot, ageing_time); } -- cgit v1.2.3 From 5a730153984dd13f82ffae93d7170d76eba204e9 Mon Sep 17 00:00:00 2001 From: Qiushi Wu Date: Fri, 22 May 2020 16:50:27 -0500 Subject: net: sun: fix missing release regions in cas_init_one(). MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In cas_init_one(), "pdev" is requested by "pci_request_regions", but it was not released after a call of the function “pci_write_config_byte” failed. Thus replace the jump target “err_write_cacheline” by "err_out_free_res". Fixes: 1f26dac32057 ("[NET]: Add Sun Cassini driver.") Signed-off-by: Qiushi Wu Signed-off-by: David S. Miller --- drivers/net/ethernet/sun/cassini.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/sun/cassini.c b/drivers/net/ethernet/sun/cassini.c index e6d1aa882fa5..f1c8615ab6f0 100644 --- a/drivers/net/ethernet/sun/cassini.c +++ b/drivers/net/ethernet/sun/cassini.c @@ -4963,7 +4963,7 @@ static int cas_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) cas_cacheline_size)) { dev_err(&pdev->dev, "Could not set PCI cache " "line size\n"); - goto err_write_cacheline; + goto err_out_free_res; } } #endif @@ -5136,7 +5136,6 @@ err_out_iounmap: err_out_free_res: pci_release_regions(pdev); -err_write_cacheline: /* Try to restore it in case the error occurred after we * set it. */ -- cgit v1.2.3 From 8a1d24e1cc6d96a17f2dcb1400d370cadbfb7cb6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 22 May 2020 23:58:28 +0100 Subject: rxrpc: Fix a warning Fix a warning due to an uninitialised variable. 
In file included from ../fs/afs/fs_probe.c:11: ../fs/afs/fs_probe.c: In function 'afs_fileserver_probe_result': ../fs/afs/internal.h:1453:2: warning: 'rtt_us' may be used uninitialized in this function [-Wmaybe-uninitialized] 1453 | printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__) | ^~~~~~ ../fs/afs/fs_probe.c:35:15: note: 'rtt_us' was declared here Signed-off-by: David Howells --- fs/afs/fs_probe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c index 237352d3cb53..37d1bba57b00 100644 --- a/fs/afs/fs_probe.c +++ b/fs/afs/fs_probe.c @@ -32,7 +32,7 @@ void afs_fileserver_probe_result(struct afs_call *call) struct afs_server *server = call->server; unsigned int server_index = call->server_index; unsigned int index = call->addr_ix; - unsigned int rtt_us; + unsigned int rtt_us = 0; bool have_result = false; int ret = call->error; -- cgit v1.2.3 From f45d01f4f30b53c3a0a1c6c1c154acb7ff74ab9f Mon Sep 17 00:00:00 2001 From: Qiushi Wu Date: Fri, 22 May 2020 13:45:18 -0500 Subject: rxrpc: Fix a memory leak in rxkad_verify_response() A ticket was not released after a call to the function "rxkad_decrypt_ticket" failed. Thus replace the jump target "temporary_error_free_resp" with "temporary_error_free_ticket". Fixes: 8c2f826dc3631 ("rxrpc: Don't put crypto buffers on the stack") Signed-off-by: Qiushi Wu Signed-off-by: David Howells cc: Markus Elfring --- net/rxrpc/rxkad.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 098f1f9ec53b..52a24d4ef5d8 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -1148,7 +1148,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, ret = rxkad_decrypt_ticket(conn, skb, ticket, ticket_len, &session_key, &expiry, _abort_code); if (ret < 0) - goto temporary_error_free_resp; + goto temporary_error_free_ticket; /* use the session key from inside the ticket to decrypt the * response */ @@ -1230,7 +1230,6 @@ protocol_error: temporary_error_free_ticket: kfree(ticket); -temporary_error_free_resp: kfree(response); temporary_error: /* Ignore the response packet if we got a temporary error such as -- cgit v1.2.3 From 17d00e839d3b592da9659c1977d45f85b77f986a Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Fri, 27 Dec 2019 07:01:53 +0200 Subject: net/mlx5: Add command entry handling completion When the FW response to commands is very slow and all command entries in use are waiting for completion, we can have a race where commands time out before they get out of the queue and are handled. A timeout completion on an uninitialized command will cause the command's buffers to be released before they are accessed for initialization, and then we will get a NULL pointer exception while trying to access them. It may also cause the buffers of another command to be released, since we may get a timeout completion before even allocating an entry index for this command. Add entry handling completion to avoid this race.
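The ordering problem described above, a timeout being acted on before the command entry has even been taken over by its handler, can be modelled in userspace with two semaphores: one for "handling started" and one for "done". This is only a toy analogue of the two completions the patch adds, not the driver code; build with -pthread:

```
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

/* Toy model: the waiter must not declare a timeout (and free the entry)
 * before the worker has at least started handling it. */
static sem_t handling;	/* "handler owns the entry" */
static sem_t done;	/* "command completed" */

static void *worker(void *arg)
{
	(void)arg;
	sem_post(&handling);	/* announce ownership of the entry */
	sleep(1);		/* pretend to talk to firmware */
	sem_post(&done);
	return NULL;
}

int main(void)
{
	pthread_t t;
	struct timespec ts;

	sem_init(&handling, 0, 0);
	sem_init(&done, 0, 0);
	pthread_create(&t, NULL, worker, NULL);

	clock_gettime(CLOCK_REALTIME, &ts);
	ts.tv_sec += 2;
	if (sem_timedwait(&handling, &ts)) {
		/* Handler never started: cancel without touching the entry. */
		fprintf(stderr, "canceled before handling started\n");
	} else {
		clock_gettime(CLOCK_REALTIME, &ts);
		ts.tv_sec += 2;
		if (sem_timedwait(&done, &ts))
			fprintf(stderr, "timed out waiting for completion\n");
		else
			printf("command completed\n");
	}

	pthread_join(t, NULL);
	return 0;
}
```

The waiter only honours a completion timeout once it knows the handler owns the entry; otherwise it backs out without ever touching the entry's buffers, which is the role of the cancel-and-return-error path in the patch below.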
Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters") Signed-off-by: Moshe Shemesh Signed-off-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 14 ++++++++++++++ include/linux/mlx5/driver.h | 1 + 2 files changed, 15 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index cede5bdfd598..d695b75bc0af 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -861,6 +861,7 @@ static void cmd_work_handler(struct work_struct *work) int alloc_ret; int cmd_mode; + complete(&ent->handling); sem = ent->page_queue ? &cmd->pages_sem : &cmd->sem; down(sem); if (!ent->page_queue) { @@ -978,6 +979,11 @@ static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent) struct mlx5_cmd *cmd = &dev->cmd; int err; + if (!wait_for_completion_timeout(&ent->handling, timeout) && + cancel_work_sync(&ent->work)) { + ent->ret = -ECANCELED; + goto out_err; + } if (cmd->mode == CMD_MODE_POLLING || ent->polling) { wait_for_completion(&ent->done); } else if (!wait_for_completion_timeout(&ent->done, timeout)) { @@ -985,12 +991,17 @@ static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent) mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); } +out_err: err = ent->ret; if (err == -ETIMEDOUT) { mlx5_core_warn(dev, "%s(0x%x) timeout. Will cause a leak of a command resource\n", mlx5_command_str(msg_to_opcode(ent->in)), msg_to_opcode(ent->in)); + } else if (err == -ECANCELED) { + mlx5_core_warn(dev, "%s(0x%x) canceled on out of queue timeout.\n", + mlx5_command_str(msg_to_opcode(ent->in)), + msg_to_opcode(ent->in)); } mlx5_core_dbg(dev, "err %d, delivery status %s(%d)\n", err, deliv_status_to_str(ent->status), ent->status); @@ -1026,6 +1037,7 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in, ent->token = token; ent->polling = force_polling; + init_completion(&ent->handling); if (!callback) init_completion(&ent->done); @@ -1045,6 +1057,8 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in, err = wait_func(dev, ent); if (err == -ETIMEDOUT) goto out; + if (err == -ECANCELED) + goto out_free; ds = ent->ts2 - ent->ts1; op = MLX5_GET(mbox_in, in->first.data, opcode); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 6f8f79ef829b..9b1f29f26c27 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -743,6 +743,7 @@ struct mlx5_cmd_work_ent { struct delayed_work cb_timeout_work; void *context; int idx; + struct completion handling; struct completion done; struct mlx5_cmd *cmd; struct work_struct work; -- cgit v1.2.3 From d43b7007dbd1195a5b6b83213e49b1516aaf6f5e Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Wed, 18 Mar 2020 21:44:32 +0200 Subject: net/mlx5: Fix a race when moving command interface to events mode After driver creates (via FW command) an EQ for commands, the driver will be informed on new commands completion by EQE. However, due to a race in driver's internal command mode metadata update, some new commands will still be miss-handled by driver as if we are in polling mode. Such commands can get two non forced completion, leading to already freed command entry access. CREATE_EQ command, that maps EQ to the command queue must be posted to the command queue while it is empty and no other command should be posted. 
Add SW mechanism that once the CREATE_EQ command is about to be executed, all other commands will return error without being sent to the FW. Allow sending other commands only after successfully changing the driver's internal command mode metadata. We can safely return error to all other commands while creating the command EQ, as all other commands might be sent from the user/application during driver load. Application can rerun them later after driver's load was finished. Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters") Signed-off-by: Eran Ben Elisha Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 35 ++++++++++++++++++++++++--- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 3 +++ include/linux/mlx5/driver.h | 6 +++++ 3 files changed, 40 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index d695b75bc0af..2f3cafdc3b1f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -848,6 +848,14 @@ static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg); static void mlx5_free_cmd_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg); +static bool opcode_allowed(struct mlx5_cmd *cmd, u16 opcode) +{ + if (cmd->allowed_opcode == CMD_ALLOWED_OPCODE_ALL) + return true; + + return cmd->allowed_opcode == opcode; +} + static void cmd_work_handler(struct work_struct *work) { struct mlx5_cmd_work_ent *ent = container_of(work, struct mlx5_cmd_work_ent, work); @@ -914,7 +922,8 @@ static void cmd_work_handler(struct work_struct *work) /* Skip sending command to fw if internal error */ if (pci_channel_offline(dev->pdev) || - dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR || + !opcode_allowed(&dev->cmd, ent->op)) { u8 status = 0; u32 drv_synd; @@ -1405,6 +1414,22 @@ static void create_debugfs_files(struct mlx5_core_dev *dev) mlx5_cmdif_debugfs_init(dev); } +void mlx5_cmd_allowed_opcode(struct mlx5_core_dev *dev, u16 opcode) +{ + struct mlx5_cmd *cmd = &dev->cmd; + int i; + + for (i = 0; i < cmd->max_reg_cmds; i++) + down(&cmd->sem); + down(&cmd->pages_sem); + + cmd->allowed_opcode = opcode; + + up(&cmd->pages_sem); + for (i = 0; i < cmd->max_reg_cmds; i++) + up(&cmd->sem); +} + static void mlx5_cmd_change_mod(struct mlx5_core_dev *dev, int mode) { struct mlx5_cmd *cmd = &dev->cmd; @@ -1681,12 +1706,13 @@ static int cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int err; u8 status = 0; u32 drv_synd; + u16 opcode; u8 token; + opcode = MLX5_GET(mbox_in, in, opcode); if (pci_channel_offline(dev->pdev) || - dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { - u16 opcode = MLX5_GET(mbox_in, in, opcode); - + dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR || + !opcode_allowed(&dev->cmd, opcode)) { err = mlx5_internal_err_ret_value(dev, opcode, &drv_synd, &status); MLX5_SET(mbox_out, out, status, status); MLX5_SET(mbox_out, out, syndrome, drv_synd); @@ -1988,6 +2014,7 @@ int mlx5_cmd_init(struct mlx5_core_dev *dev) mlx5_core_dbg(dev, "descriptor at dma 0x%llx\n", (unsigned long long)(cmd->dma)); cmd->mode = CMD_MODE_POLLING; + cmd->allowed_opcode = CMD_ALLOWED_OPCODE_ALL; create_msg_cache(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index cccea3a8eddd..ce6c621af043 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ 
b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -611,11 +611,13 @@ static int create_async_eqs(struct mlx5_core_dev *dev) .nent = MLX5_NUM_CMD_EQE, .mask[0] = 1ull << MLX5_EVENT_TYPE_CMD, }; + mlx5_cmd_allowed_opcode(dev, MLX5_CMD_OP_CREATE_EQ); err = setup_async_eq(dev, &table->cmd_eq, ¶m, "cmd"); if (err) goto err1; mlx5_cmd_use_events(dev); + mlx5_cmd_allowed_opcode(dev, CMD_ALLOWED_OPCODE_ALL); param = (struct mlx5_eq_param) { .irq_index = 0, @@ -645,6 +647,7 @@ err2: mlx5_cmd_use_polling(dev); cleanup_async_eq(dev, &table->cmd_eq, "cmd"); err1: + mlx5_cmd_allowed_opcode(dev, CMD_ALLOWED_OPCODE_ALL); mlx5_eq_notifier_unregister(dev, &table->cq_err_nb); return err; } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 9b1f29f26c27..c03778c75dfa 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -284,6 +284,7 @@ struct mlx5_cmd { struct semaphore sem; struct semaphore pages_sem; int mode; + u16 allowed_opcode; struct mlx5_cmd_work_ent *ent_arr[MLX5_MAX_COMMANDS]; struct dma_pool *pool; struct mlx5_cmd_debug dbg; @@ -875,10 +876,15 @@ mlx5_frag_buf_get_idx_last_contig_stride(struct mlx5_frag_buf_ctrl *fbc, u32 ix) return min_t(u32, last_frag_stride_idx - fbc->strides_offset, fbc->sz_m1); } +enum { + CMD_ALLOWED_OPCODE_ALL, +}; + int mlx5_cmd_init(struct mlx5_core_dev *dev); void mlx5_cmd_cleanup(struct mlx5_core_dev *dev); void mlx5_cmd_use_events(struct mlx5_core_dev *dev); void mlx5_cmd_use_polling(struct mlx5_core_dev *dev); +void mlx5_cmd_allowed_opcode(struct mlx5_core_dev *dev, u16 opcode); struct mlx5_async_ctx { struct mlx5_core_dev *dev; -- cgit v1.2.3 From f7936ddd35d8b849daf0372770c7c9dbe7910fca Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Thu, 19 Mar 2020 21:43:13 +0200 Subject: net/mlx5: Avoid processing commands before cmdif is ready When driver is reloading during recovery flow, it can't get new commands till command interface is up again. Otherwise we may get to null pointer trying to access non initialized command structures. Add cmdif state to avoid processing commands while cmdif is not ready. 
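Together with the allowed-opcode gate from the previous patch, the check in front of the firmware can be summarized like this (a condensed sketch; cmd_may_be_posted() is a hypothetical helper, the real driver open-codes the condition in cmd_exec() and cmd_work_handler()):

static bool opcode_allowed(struct mlx5_cmd *cmd, u16 opcode)
{
        if (cmd->allowed_opcode == CMD_ALLOWED_OPCODE_ALL)
                return true;

        return cmd->allowed_opcode == opcode;
}

/* Hypothetical helper for illustration: a command is posted to the FW only if
 * the PCI channel is usable, the device is not in internal error, the command
 * interface has been brought up, and the opcode is currently allowed (while
 * the command EQ is being created, only MLX5_CMD_OP_CREATE_EQ is allowed).
 * Anything else gets a driver-synthesized error status instead.
 */
static bool cmd_may_be_posted(struct mlx5_core_dev *dev, u16 opcode)
{
        return !pci_channel_offline(dev->pdev) &&
               dev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR &&
               dev->cmd.state == MLX5_CMDIF_STATE_UP &&
               opcode_allowed(&dev->cmd, opcode);
}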
Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters") Signed-off-by: Eran Ben Elisha Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 10 ++++++++++ drivers/net/ethernet/mellanox/mlx5/core/main.c | 4 ++++ include/linux/mlx5/driver.h | 9 +++++++++ 3 files changed, 23 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 2f3cafdc3b1f..7a77fe40af3a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -923,6 +923,7 @@ static void cmd_work_handler(struct work_struct *work) /* Skip sending command to fw if internal error */ if (pci_channel_offline(dev->pdev) || dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR || + cmd->state != MLX5_CMDIF_STATE_UP || !opcode_allowed(&dev->cmd, ent->op)) { u8 status = 0; u32 drv_synd; @@ -1712,6 +1713,7 @@ static int cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, opcode = MLX5_GET(mbox_in, in, opcode); if (pci_channel_offline(dev->pdev) || dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR || + dev->cmd.state != MLX5_CMDIF_STATE_UP || !opcode_allowed(&dev->cmd, opcode)) { err = mlx5_internal_err_ret_value(dev, opcode, &drv_synd, &status); MLX5_SET(mbox_out, out, status, status); @@ -1977,6 +1979,7 @@ int mlx5_cmd_init(struct mlx5_core_dev *dev) goto err_free_page; } + cmd->state = MLX5_CMDIF_STATE_DOWN; cmd->checksum_disabled = 1; cmd->max_reg_cmds = (1 << cmd->log_sz) - 1; cmd->bitmask = (1UL << cmd->max_reg_cmds) - 1; @@ -2054,3 +2057,10 @@ void mlx5_cmd_cleanup(struct mlx5_core_dev *dev) dma_pool_destroy(cmd->pool); } EXPORT_SYMBOL(mlx5_cmd_cleanup); + +void mlx5_cmd_set_state(struct mlx5_core_dev *dev, + enum mlx5_cmdif_state cmdif_state) +{ + dev->cmd.state = cmdif_state; +} +EXPORT_SYMBOL(mlx5_cmd_set_state); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 7af4210c1b96..a61e473db7e1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -965,6 +965,8 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot) goto err_cmd_cleanup; } + mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_UP); + err = mlx5_core_enable_hca(dev, 0); if (err) { mlx5_core_err(dev, "enable hca failed\n"); @@ -1026,6 +1028,7 @@ reclaim_boot_pages: err_disable_hca: mlx5_core_disable_hca(dev, 0); err_cmd_cleanup: + mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_DOWN); mlx5_cmd_cleanup(dev); return err; @@ -1043,6 +1046,7 @@ static int mlx5_function_teardown(struct mlx5_core_dev *dev, bool boot) } mlx5_reclaim_startup_pages(dev); mlx5_core_disable_hca(dev, 0); + mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_DOWN); mlx5_cmd_cleanup(dev); return 0; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index c03778c75dfa..8397b6558dc7 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -213,6 +213,12 @@ enum mlx5_port_status { MLX5_PORT_DOWN = 2, }; +enum mlx5_cmdif_state { + MLX5_CMDIF_STATE_UNINITIALIZED, + MLX5_CMDIF_STATE_UP, + MLX5_CMDIF_STATE_DOWN, +}; + struct mlx5_cmd_first { __be32 data[4]; }; @@ -258,6 +264,7 @@ struct mlx5_cmd_stats { struct mlx5_cmd { struct mlx5_nb nb; + enum mlx5_cmdif_state state; void *cmd_alloc_buf; dma_addr_t alloc_dma; int alloc_size; @@ -882,6 +889,8 @@ enum { int mlx5_cmd_init(struct mlx5_core_dev *dev); void mlx5_cmd_cleanup(struct mlx5_core_dev *dev); +void 
mlx5_cmd_set_state(struct mlx5_core_dev *dev, + enum mlx5_cmdif_state cmdif_state); void mlx5_cmd_use_events(struct mlx5_core_dev *dev); void mlx5_cmd_use_polling(struct mlx5_core_dev *dev); void mlx5_cmd_allowed_opcode(struct mlx5_core_dev *dev, u16 opcode); -- cgit v1.2.3 From 321348475d544aa6705dcfac2135deeccb8dc0bb Mon Sep 17 00:00:00 2001 From: Maor Dickman Date: Thu, 23 Apr 2020 15:16:17 +0300 Subject: net/mlx5e: Fix allowed tc redirect merged eswitch offload cases After changing the parent_id to be the same for both NICs of the same hardware, the cited commit wrongly allows offload of tc redirect flows from VF to uplink and vice versa when the devices are on different eswitches; these cases aren't supported by HW. Disallow the above offloads when the devices are on different eswitches and VF LAG is not configured. Fixes: f6dc1264f1c0 ("net/mlx5e: Disallow tc redirect offload cases we don't support") Signed-off-by: Maor Dickman Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 8 ++--- drivers/net/ethernet/mellanox/mlx5/core/en_rep.h | 7 ++++- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 40 +++++++++++++++++++----- 3 files changed, 41 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index f372e94948fd..cdecf4280e86 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -1484,13 +1484,9 @@ bool mlx5e_eswitch_uplink_rep(struct net_device *netdev) return netdev->netdev_ops == &mlx5e_netdev_ops_uplink_rep; } -bool mlx5e_eswitch_rep(struct net_device *netdev) +bool mlx5e_eswitch_vf_rep(struct net_device *netdev) { - if (netdev->netdev_ops == &mlx5e_netdev_ops_rep || - netdev->netdev_ops == &mlx5e_netdev_ops_uplink_rep) - return true; - - return false; + return netdev->netdev_ops == &mlx5e_netdev_ops_rep; } static void mlx5e_build_rep_params(struct net_device *netdev) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h index 6a2337900420..612b5cf0673d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h @@ -210,8 +210,13 @@ void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv, void mlx5e_rep_queue_neigh_stats_work(struct mlx5e_priv *priv); -bool mlx5e_eswitch_rep(struct net_device *netdev); +bool mlx5e_eswitch_vf_rep(struct net_device *netdev); bool mlx5e_eswitch_uplink_rep(struct net_device *netdev); +static inline bool mlx5e_eswitch_rep(struct net_device *netdev) +{ + return mlx5e_eswitch_vf_rep(netdev) || + mlx5e_eswitch_uplink_rep(netdev); +} #else /* CONFIG_MLX5_ESWITCH */ static inline bool mlx5e_is_uplink_rep(struct mlx5e_priv *priv) { return false; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index a574c588269a..5bcf95fcdd59 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -3073,6 +3073,11 @@ static bool actions_match_supported(struct mlx5e_priv *priv, return true; } +static bool same_port_devs(struct mlx5e_priv *priv, struct mlx5e_priv *peer_priv) +{ + return priv->mdev == peer_priv->mdev; +} + static bool same_hw_devs(struct mlx5e_priv *priv, struct mlx5e_priv *peer_priv) { struct mlx5_core_dev *fmdev, *pmdev; @@ -3291,7 +3296,7 @@ static inline int hash_encap_info(struct encap_key *key) } -static bool
is_merged_eswitch_dev(struct mlx5e_priv *priv, +static bool is_merged_eswitch_vfs(struct mlx5e_priv *priv, struct net_device *peer_netdev) { struct mlx5e_priv *peer_priv; @@ -3299,13 +3304,11 @@ static bool is_merged_eswitch_dev(struct mlx5e_priv *priv, peer_priv = netdev_priv(peer_netdev); return (MLX5_CAP_ESW(priv->mdev, merged_eswitch) && - mlx5e_eswitch_rep(priv->netdev) && - mlx5e_eswitch_rep(peer_netdev) && + mlx5e_eswitch_vf_rep(priv->netdev) && + mlx5e_eswitch_vf_rep(peer_netdev) && same_hw_devs(priv, peer_priv)); } - - bool mlx5e_encap_take(struct mlx5e_encap_entry *e) { return refcount_inc_not_zero(&e->refcnt); @@ -3575,14 +3578,37 @@ static int add_vlan_pop_action(struct mlx5e_priv *priv, return err; } +static bool same_hw_reps(struct mlx5e_priv *priv, + struct net_device *peer_netdev) +{ + struct mlx5e_priv *peer_priv; + + peer_priv = netdev_priv(peer_netdev); + + return mlx5e_eswitch_rep(priv->netdev) && + mlx5e_eswitch_rep(peer_netdev) && + same_hw_devs(priv, peer_priv); +} + +static bool is_lag_dev(struct mlx5e_priv *priv, + struct net_device *peer_netdev) +{ + return ((mlx5_lag_is_sriov(priv->mdev) || + mlx5_lag_is_multipath(priv->mdev)) && + same_hw_reps(priv, peer_netdev)); +} + bool mlx5e_is_valid_eswitch_fwd_dev(struct mlx5e_priv *priv, struct net_device *out_dev) { - if (is_merged_eswitch_dev(priv, out_dev)) + if (is_merged_eswitch_vfs(priv, out_dev)) + return true; + + if (is_lag_dev(priv, out_dev)) return true; return mlx5e_eswitch_rep(out_dev) && - same_hw_devs(priv, netdev_priv(out_dev)); + same_port_devs(priv, netdev_priv(out_dev)); } static bool is_duplicated_output_device(struct net_device *dev, -- cgit v1.2.3 From 16736e11f43b80a38f98f6add54fab3b8c297df3 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 27 Apr 2020 16:56:59 +0300 Subject: net/mlx5e: kTLS, Destroy key object after destroying the TIS The TLS TIS object contains the dek/key ID. By destroying the key first, the TIS would contain an invalid non-existing key ID. Reverse the destroy order, this also acheives the desired assymetry between the destroy and the create flows. Fixes: d2ead1f360e8 ("net/mlx5e: Add kTLS TX HW offload support") Signed-off-by: Tariq Toukan Reviewed-by: Boris Pismenny Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c index 46725cd743a3..7d1985fa0d4f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c @@ -69,8 +69,8 @@ static void mlx5e_ktls_del(struct net_device *netdev, struct mlx5e_ktls_offload_context_tx *tx_priv = mlx5e_get_ktls_tx_priv_ctx(tls_ctx); - mlx5_ktls_destroy_key(priv->mdev, tx_priv->key_id); mlx5e_destroy_tis(priv->mdev, tx_priv->tisn); + mlx5_ktls_destroy_key(priv->mdev, tx_priv->key_id); kvfree(tx_priv); } -- cgit v1.2.3 From a16b8e0dcf7043bee46174bed0553cc9e36b63a5 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Thu, 30 Apr 2020 09:16:01 +0300 Subject: net/mlx5e: Fix inner tirs handling In the cited commit inner_tirs argument was added to create and destroy inner tirs, and no indication was added to mlx5e_modify_tirs_hash() function. In order to have a consistent handling, use inner_indir_tir[0].tirn in tirs destroy/modify function as an indication to whether inner tirs are created. 
Inner tirs are not created for representors and before this commit, a call to mlx5e_modify_tirs_hash() was sending HW commands to modify non-existent inner tirs. Fixes: 46dc933cee82 ("net/mlx5e: Provide explicit directive if to create inner indirect tirs") Signed-off-by: Roi Dayan Reviewed-by: Vlad Buslov Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 12 +++++++----- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 4 ++-- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 23701c0e36ec..59745402747b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -1121,7 +1121,7 @@ void mlx5e_close_drop_rq(struct mlx5e_rq *drop_rq); int mlx5e_create_indirect_rqt(struct mlx5e_priv *priv); int mlx5e_create_indirect_tirs(struct mlx5e_priv *priv, bool inner_ttc); -void mlx5e_destroy_indirect_tirs(struct mlx5e_priv *priv, bool inner_ttc); +void mlx5e_destroy_indirect_tirs(struct mlx5e_priv *priv); int mlx5e_create_direct_rqts(struct mlx5e_priv *priv, struct mlx5e_tir *tirs); void mlx5e_destroy_direct_rqts(struct mlx5e_priv *priv, struct mlx5e_tir *tirs); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index b314adf438da..c6b83042d431 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2717,7 +2717,8 @@ void mlx5e_modify_tirs_hash(struct mlx5e_priv *priv, void *in, int inlen) mlx5_core_modify_tir(mdev, priv->indir_tir[tt].tirn, in, inlen); } - if (!mlx5e_tunnel_inner_ft_supported(priv->mdev)) + /* Verify inner tirs resources allocated */ + if (!priv->inner_indir_tir[0].tirn) return; for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) { @@ -3408,14 +3409,15 @@ out: return err; } -void mlx5e_destroy_indirect_tirs(struct mlx5e_priv *priv, bool inner_ttc) +void mlx5e_destroy_indirect_tirs(struct mlx5e_priv *priv) { int i; for (i = 0; i < MLX5E_NUM_INDIR_TIRS; i++) mlx5e_destroy_tir(priv->mdev, &priv->indir_tir[i]); - if (!inner_ttc || !mlx5e_tunnel_inner_ft_supported(priv->mdev)) + /* Verify inner tirs resources allocated */ + if (!priv->inner_indir_tir[0].tirn) return; for (i = 0; i < MLX5E_NUM_INDIR_TIRS; i++) @@ -5123,7 +5125,7 @@ err_destroy_xsk_rqts: err_destroy_direct_tirs: mlx5e_destroy_direct_tirs(priv, priv->direct_tir); err_destroy_indirect_tirs: - mlx5e_destroy_indirect_tirs(priv, true); + mlx5e_destroy_indirect_tirs(priv); err_destroy_direct_rqts: mlx5e_destroy_direct_rqts(priv, priv->direct_tir); err_destroy_indirect_rqts: @@ -5142,7 +5144,7 @@ static void mlx5e_cleanup_nic_rx(struct mlx5e_priv *priv) mlx5e_destroy_direct_tirs(priv, priv->xsk_tir); mlx5e_destroy_direct_rqts(priv, priv->xsk_tir); mlx5e_destroy_direct_tirs(priv, priv->direct_tir); - mlx5e_destroy_indirect_tirs(priv, true); + mlx5e_destroy_indirect_tirs(priv); mlx5e_destroy_direct_rqts(priv, priv->direct_tir); mlx5e_destroy_rqt(priv, &priv->indir_rqt); mlx5e_close_drop_rq(&priv->drop_rq); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index cdecf4280e86..4a8e0dfdc5f2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -1743,7 +1743,7 @@ 
err_destroy_ttc_table: err_destroy_direct_tirs: mlx5e_destroy_direct_tirs(priv, priv->direct_tir); err_destroy_indirect_tirs: - mlx5e_destroy_indirect_tirs(priv, false); + mlx5e_destroy_indirect_tirs(priv); err_destroy_direct_rqts: mlx5e_destroy_direct_rqts(priv, priv->direct_tir); err_destroy_indirect_rqts: @@ -1761,7 +1761,7 @@ static void mlx5e_cleanup_rep_rx(struct mlx5e_priv *priv) mlx5e_destroy_rep_root_ft(priv); mlx5e_destroy_ttc_table(priv, &priv->fs.ttc); mlx5e_destroy_direct_tirs(priv, priv->direct_tir); - mlx5e_destroy_indirect_tirs(priv, false); + mlx5e_destroy_indirect_tirs(priv); mlx5e_destroy_direct_rqts(priv, priv->direct_tir); mlx5e_destroy_rqt(priv, &priv->indir_rqt); mlx5e_close_drop_rq(&priv->drop_rq); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c index 673aaa815f57..505cf6eeae25 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -396,7 +396,7 @@ static int mlx5i_init_rx(struct mlx5e_priv *priv) err_destroy_direct_tirs: mlx5e_destroy_direct_tirs(priv, priv->direct_tir); err_destroy_indirect_tirs: - mlx5e_destroy_indirect_tirs(priv, true); + mlx5e_destroy_indirect_tirs(priv); err_destroy_direct_rqts: mlx5e_destroy_direct_rqts(priv, priv->direct_tir); err_destroy_indirect_rqts: @@ -412,7 +412,7 @@ static void mlx5i_cleanup_rx(struct mlx5e_priv *priv) { mlx5i_destroy_flow_steering(priv); mlx5e_destroy_direct_tirs(priv, priv->direct_tir); - mlx5e_destroy_indirect_tirs(priv, true); + mlx5e_destroy_indirect_tirs(priv); mlx5e_destroy_direct_rqts(priv, priv->direct_tir); mlx5e_destroy_rqt(priv, &priv->indir_rqt); mlx5e_close_drop_rq(&priv->drop_rq); -- cgit v1.2.3 From df14ad1eccb04a4a28c90389214dbacab085b244 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 29 Apr 2020 23:56:58 +0300 Subject: net/mlx5: Fix memory leak in mlx5_events_init Fix a memory leak in mlx5_events_init(): in case create_singlethread_workqueue() fails, the events struct should be freed. Fixes: 5d3c537f9070 ("net/mlx5: Handle event of power detection in the PCIE slot") Signed-off-by: Moshe Shemesh Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/events.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/events.c b/drivers/net/ethernet/mellanox/mlx5/core/events.c index 8bcf3426b9c6..3ce17c3d7a00 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/events.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/events.c @@ -346,8 +346,10 @@ int mlx5_events_init(struct mlx5_core_dev *dev) events->dev = dev; dev->priv.events = events; events->wq = create_singlethread_workqueue("mlx5_events"); - if (!events->wq) + if (!events->wq) { + kfree(events); return -ENOMEM; + } INIT_WORK(&events->pcie_core_work, mlx5_pcie_event); return 0; -- cgit v1.2.3 From aee37f3d940ca732df71c3df49347bccaafc0b24 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Mon, 11 May 2020 16:32:09 +0300 Subject: net/mlx5: Fix cleaning unmanaged flow tables Unmanaged flow tables don't have a parent, and tree_put_node() assumes there is always a parent if cleaning is needed. Fix that.
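The resulting teardown logic can be pictured as follows (a condensed sketch of tree_put_node() after this patch; reference counting and the parent put are elided, the exact code is in the diff below):

static void tree_put_node(struct fs_node *node, bool locked)
{
        struct fs_node *parent_node = node->parent;

        if (node->del_hw_func)
                node->del_hw_func(node);

        if (parent_node) {
                /* Managed node: unlink it from its parent before
                 * releasing the software state.
                 */
                down_write_ref_node(parent_node, locked);
                list_del_init(&node->list);
                if (node->del_sw_func)
                        node->del_sw_func(node);
                up_write_ref_node(parent_node, locked);
        } else if (node->del_sw_func) {
                /* Unmanaged table: no parent, but its software state
                 * still has to be torn down.
                 */
                node->del_sw_func(node);
        } else {
                /* Root namespace: only the node itself to free. */
                kfree(node);
        }
}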
Fixes: 5281a0c90919 ("net/mlx5: fs_core: Introduce unmanaged flow tables") Signed-off-by: Roi Dayan Reviewed-by: Mark Bloch Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index d5defe09339a..8f62bfcf57af 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -344,14 +344,13 @@ static void tree_put_node(struct fs_node *node, bool locked) if (node->del_hw_func) node->del_hw_func(node); if (parent_node) { - /* Only root namespace doesn't have parent and we just - * need to free its node. - */ down_write_ref_node(parent_node, locked); list_del_init(&node->list); if (node->del_sw_func) node->del_sw_func(node); up_write_ref_node(parent_node, locked); + } else if (node->del_sw_func) { + node->del_sw_func(node); } else { kfree(node); } @@ -468,8 +467,10 @@ static void del_sw_flow_table(struct fs_node *node) fs_get_obj(ft, node); rhltable_destroy(&ft->fgs_hash); - fs_get_obj(prio, ft->node.parent); - prio->num_ft--; + if (ft->node.parent) { + fs_get_obj(prio, ft->node.parent); + prio->num_ft--; + } kfree(ft); } -- cgit v1.2.3 From 6eb7a268a99bad8346d4baa148a14456d061c1c3 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Mon, 11 May 2020 16:37:11 +0300 Subject: net/mlx5: Don't maintain a case of del_sw_func being null Add del_sw_func cb for root ns. Now there is no need to maintain a case of del_sw_func being null when freeing the node. Fixes: 2cc43b494a6c ("net/mlx5_core: Managing root flow table") Signed-off-by: Roi Dayan Reviewed-by: Mark Bloch Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 8f62bfcf57af..02d0f94eaaad 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -346,14 +346,10 @@ static void tree_put_node(struct fs_node *node, bool locked) if (parent_node) { down_write_ref_node(parent_node, locked); list_del_init(&node->list); - if (node->del_sw_func) - node->del_sw_func(node); - up_write_ref_node(parent_node, locked); - } else if (node->del_sw_func) { - node->del_sw_func(node); - } else { - kfree(node); } + node->del_sw_func(node); + if (parent_node) + up_write_ref_node(parent_node, locked); node = NULL; } if (!node && parent_node) @@ -2352,6 +2348,11 @@ static int init_root_tree(struct mlx5_flow_steering *steering, return 0; } +static void del_sw_root_ns(struct fs_node *node) +{ + kfree(node); +} + static struct mlx5_flow_root_namespace *create_root_ns(struct mlx5_flow_steering *steering, enum fs_flow_table_type table_type) @@ -2378,7 +2379,7 @@ static struct mlx5_flow_root_namespace ns = &root_ns->ns; fs_init_namespace(ns); mutex_init(&root_ns->chain_lock); - tree_init_node(&ns->node, NULL, NULL); + tree_init_node(&ns->node, NULL, del_sw_root_ns); tree_add_node(&ns->node, NULL); return root_ns; -- cgit v1.2.3 From 9ca415399dae133b00273a4283ef31d003a6818d Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Thu, 14 May 2020 23:44:38 +0300 Subject: net/mlx5: Annotate mutex destroy for root ns Invoke mutex_destroy() to catch any errors. 
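In other words, del_sw_root_ns() now mirrors create_root_ns(): the chain_lock that is mutex_init()'ed at creation is mutex_destroy()'ed when the node's software state goes away. The callback below is the same as in the diff that follows, shown here only with a note on why the annotation is useful:

static void del_sw_root_ns(struct fs_node *node)
{
        struct mlx5_flow_root_namespace *root_ns;
        struct mlx5_flow_namespace *ns;

        fs_get_obj(ns, node);
        root_ns = container_of(ns, struct mlx5_flow_root_namespace, ns);

        /* Pairs with mutex_init(&root_ns->chain_lock) in create_root_ns().
         * mutex_destroy() is a no-op on production builds, but with
         * CONFIG_DEBUG_MUTEXES it flags freeing the namespace while
         * chain_lock is held, or any later use of the freed lock.
         */
        mutex_destroy(&root_ns->chain_lock);
        kfree(node);
}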
Fixes: 2cc43b494a6c ("net/mlx5_core: Managing root flow table") Signed-off-by: Roi Dayan Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 02d0f94eaaad..9620c8650e13 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -2350,6 +2350,12 @@ static int init_root_tree(struct mlx5_flow_steering *steering, static void del_sw_root_ns(struct fs_node *node) { + struct mlx5_flow_root_namespace *root_ns; + struct mlx5_flow_namespace *ns; + + fs_get_obj(ns, node); + root_ns = container_of(ns, struct mlx5_flow_root_namespace, ns); + mutex_destroy(&root_ns->chain_lock); kfree(node); } -- cgit v1.2.3 From 5e911e2c06bd8c17df29147a5e2d4b17fafda024 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Tue, 7 Apr 2020 17:38:28 +0300 Subject: net/mlx5e: Update netdev txq on completions during closure On SQ closure, when we free its descriptors, we should also update the netdev txq counters for completions that will never arrive. Otherwise, if we reopen the SQs and attach them back, for example in the FW fatal recovery flow, we may get a TX timeout. Fixes: 29429f3300a3 ("net/mlx5e: Timeout if SQ doesn't flush during close") Signed-off-by: Moshe Shemesh Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index fd6b2a1898c5..119a5c6cc167 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -537,10 +537,9 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget) void mlx5e_free_txqsq_descs(struct mlx5e_txqsq *sq) { struct mlx5e_tx_wqe_info *wi; + u32 dma_fifo_cc, nbytes = 0; + u16 ci, sqcc, npkts = 0; struct sk_buff *skb; - u32 dma_fifo_cc; - u16 sqcc; - u16 ci; int i; sqcc = sq->cc; @@ -565,11 +564,15 @@ void mlx5e_free_txqsq_descs(struct mlx5e_txqsq *sq) } dev_kfree_skb_any(skb); + npkts++; + nbytes += wi->num_bytes; sqcc += wi->num_wqebbs; } sq->dma_fifo_cc = dma_fifo_cc; sq->cc = sqcc; + + netdev_tx_completed_queue(sq->txq, npkts, nbytes); } #ifdef CONFIG_MLX5_CORE_IPOIB -- cgit v1.2.3 From d37bd5e81ed0d58f0ebe2e01658c26722e0c033e Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Mon, 18 May 2020 20:21:11 +0300 Subject: net/mlx5e: CT: Correctly get flow rule The correct way is to use the flow_cls_offload_flow_rule() wrapper instead of accessing f->rule directly.
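As a usage sketch, a classifier-offload callback that matches on conntrack state should go through the wrapper like this (the function name here is illustrative, not the driver's):

static int example_parse_ct_match(struct flow_cls_offload *f)
{
        struct flow_rule *rule = flow_cls_offload_flow_rule(f);
        struct flow_match_ct match;

        /* The wrapper is the supported accessor for the rule carried in a
         * flow_cls_offload command; open-coding f->rule ties the driver to
         * the struct layout instead of the offload API.
         */
        if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_CT))
                return 0;

        flow_rule_match_ct(rule, &match);
        /* match.key / match.mask now hold the requested ct_state bits. */
        return 0;
}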
Fixes: 4c3844d9e97e ("net/mlx5e: CT: Introduce connection tracking") Signed-off-by: Roi Dayan Reviewed-by: Oz Shlomo Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c | 5 +++-- drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index a172c5e39710..4eb305af0106 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -699,6 +699,7 @@ mlx5_tc_ct_parse_match(struct mlx5e_priv *priv, struct netlink_ext_ack *extack) { struct mlx5_tc_ct_priv *ct_priv = mlx5_tc_ct_get_ct_priv(priv); + struct flow_rule *rule = flow_cls_offload_flow_rule(f); struct flow_dissector_key_ct *mask, *key; bool trk, est, untrk, unest, new; u32 ctstate = 0, ctstate_mask = 0; @@ -706,7 +707,7 @@ mlx5_tc_ct_parse_match(struct mlx5e_priv *priv, u16 ct_state, ct_state_mask; struct flow_match_ct match; - if (!flow_rule_match_key(f->rule, FLOW_DISSECTOR_KEY_CT)) + if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_CT)) return 0; if (!ct_priv) { @@ -715,7 +716,7 @@ mlx5_tc_ct_parse_match(struct mlx5e_priv *priv, return -EOPNOTSUPP; } - flow_rule_match_ct(f->rule, &match); + flow_rule_match_ct(rule, &match); key = match.key; mask = match.mask; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h index 091d305b633e..626f6c04882e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h @@ -130,7 +130,9 @@ mlx5_tc_ct_parse_match(struct mlx5e_priv *priv, struct flow_cls_offload *f, struct netlink_ext_ack *extack) { - if (!flow_rule_match_key(f->rule, FLOW_DISSECTOR_KEY_CT)) + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + + if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_CT)) return 0; NL_SET_ERR_MSG_MOD(extack, "mlx5 tc ct offload isn't enabled."); -- cgit v1.2.3 From 4f7400d5cbaef676e00cdffb0565bf731c6bb09e Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Wed, 6 May 2020 14:52:04 +0300 Subject: net/mlx5: Fix error flow in case of function_setup failure Currently, if an error occurred during mlx5_function_setup(), we keep dev->state as DEVICE_STATE_UP. Fixing it by adding a goto label. 
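The pattern is the usual kernel goto unwind: every failure past a given setup step jumps to a label that undoes exactly what has already succeeded, and this patch adds the missing final label so dev->state is reset as well. A condensed sketch of the control flow (label names follow the diff below; the intermediate labels are elided):

int mlx5_load_one(struct mlx5_core_dev *dev, bool boot)
{
        int err;

        mutex_lock(&dev->intf_state_mutex);
        dev->state = MLX5_DEVICE_STATE_UP;

        err = mlx5_function_setup(dev, boot);
        if (err)
                goto err_function;

        /* ... mlx5_init_once(), mlx5_load(), attach interfaces ... */

        mutex_unlock(&dev->intf_state_mutex);
        return 0;

        /* ... err_load / function_teardown labels elided ... */
err_function:
        dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
        mutex_unlock(&dev->intf_state_mutex);
        return err;
}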
Fixes: e161105e58da ("net/mlx5: Function setup/teardown procedures") Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index a61e473db7e1..c1618b818f3a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1195,7 +1195,7 @@ int mlx5_load_one(struct mlx5_core_dev *dev, bool boot) err = mlx5_function_setup(dev, boot); if (err) - goto out; + goto err_function; if (boot) { err = mlx5_init_once(dev); @@ -1233,6 +1233,7 @@ err_load: mlx5_cleanup_once(dev); function_teardown: mlx5_function_teardown(dev, boot); +err_function: dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; mutex_unlock(&dev->intf_state_mutex); -- cgit v1.2.3 From ef24d6c3d6965158dfe23ae961d87e9a343e18a2 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Fri, 22 May 2020 19:03:21 +0800 Subject: net: Fix return value about devm_platform_ioremap_resource() When call function devm_platform_ioremap_resource(), we should use IS_ERR() to check the return value and return PTR_ERR() if failed. Signed-off-by: Tiezhu Yang Signed-off-by: David S. Miller --- drivers/net/can/ifi_canfd/ifi_canfd.c | 5 ++++- drivers/net/can/sun4i_can.c | 2 +- drivers/net/dsa/b53/b53_srab.c | 2 +- drivers/net/ethernet/marvell/pxa168_eth.c | 2 +- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/can/ifi_canfd/ifi_canfd.c b/drivers/net/can/ifi_canfd/ifi_canfd.c index 04d59bede5ea..74503cacf594 100644 --- a/drivers/net/can/ifi_canfd/ifi_canfd.c +++ b/drivers/net/can/ifi_canfd/ifi_canfd.c @@ -947,8 +947,11 @@ static int ifi_canfd_plat_probe(struct platform_device *pdev) u32 id, rev; addr = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(addr)) + return PTR_ERR(addr); + irq = platform_get_irq(pdev, 0); - if (IS_ERR(addr) || irq < 0) + if (irq < 0) return -EINVAL; id = readl(addr + IFI_CANFD_IP_ID); diff --git a/drivers/net/can/sun4i_can.c b/drivers/net/can/sun4i_can.c index e3ba8ab0cbf4..e2c6cf4b2228 100644 --- a/drivers/net/can/sun4i_can.c +++ b/drivers/net/can/sun4i_can.c @@ -792,7 +792,7 @@ static int sun4ican_probe(struct platform_device *pdev) addr = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(addr)) { - err = -EBUSY; + err = PTR_ERR(addr); goto exit; } diff --git a/drivers/net/dsa/b53/b53_srab.c b/drivers/net/dsa/b53/b53_srab.c index 0a1be5259be0..38cd8285ac67 100644 --- a/drivers/net/dsa/b53/b53_srab.c +++ b/drivers/net/dsa/b53/b53_srab.c @@ -609,7 +609,7 @@ static int b53_srab_probe(struct platform_device *pdev) priv->regs = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(priv->regs)) - return -ENOMEM; + return PTR_ERR(priv->regs); dev = b53_switch_alloc(&pdev->dev, &b53_srab_ops, priv); if (!dev) diff --git a/drivers/net/ethernet/marvell/pxa168_eth.c b/drivers/net/ethernet/marvell/pxa168_eth.c index 7a0d785b826c..17243bb5ba91 100644 --- a/drivers/net/ethernet/marvell/pxa168_eth.c +++ b/drivers/net/ethernet/marvell/pxa168_eth.c @@ -1418,7 +1418,7 @@ static int pxa168_eth_probe(struct platform_device *pdev) pep->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(pep->base)) { - err = -ENOMEM; + err = PTR_ERR(pep->base); goto err_netdev; } -- cgit v1.2.3 From 31096c3e8b1163c6e966bf4d1f36d8b699008f84 Mon Sep 17 00:00:00 2001 From: Leon Yu Date: Fri, 22 May 2020 23:29:43 +0800 Subject: net: stmmac: don't attach 
interface until resume finishes Commit 14b41a2959fb ("net: stmmac: Delete txtimer in suspend") was the first attempt to fix a race between mod_timer() and setup_timer() during stmmac_resume(). However the issue still exists as the commit only addressed half of the issue. Same race can still happen as stmmac_resume() re-attaches interface way too early - even before hardware is fully initialized. Worse, doing so allows network traffic to restart and stmmac_tx_timer_arm() being called in the middle of stmmac_resume(), which re-init tx timers in stmmac_init_coalesce(). timer_list will be corrupted and system crashes as a result of race between mod_timer() and setup_timer(). systemd--1995 2.... 552950018us : stmmac_suspend: 4994 ksoftirq-9 0..s2 553123133us : stmmac_tx_timer_arm: 2276 systemd--1995 0.... 553127896us : stmmac_resume: 5101 systemd--320 7...2 553132752us : stmmac_tx_timer_arm: 2276 (sd-exec-1999 5...2 553135204us : stmmac_tx_timer_arm: 2276 --------------------------------- pc : run_timer_softirq+0x468/0x5e0 lr : run_timer_softirq+0x570/0x5e0 Call trace: run_timer_softirq+0x468/0x5e0 __do_softirq+0x124/0x398 irq_exit+0xd8/0xe0 __handle_domain_irq+0x6c/0xc0 gic_handle_irq+0x60/0xb0 el1_irq+0xb8/0x180 arch_cpu_idle+0x38/0x230 default_idle_call+0x24/0x3c do_idle+0x1e0/0x2b8 cpu_startup_entry+0x28/0x48 secondary_start_kernel+0x1b4/0x208 Fix this by deferring netif_device_attach() to the end of stmmac_resume(). Signed-off-by: Leon Yu Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index a999d6b33a64..1f319c9cee46 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -5190,8 +5190,6 @@ int stmmac_resume(struct device *dev) return ret; } - netif_device_attach(ndev); - mutex_lock(&priv->lock); stmmac_reset_queues_param(priv); @@ -5218,6 +5216,8 @@ int stmmac_resume(struct device *dev) phylink_mac_change(priv->phylink, true); + netif_device_attach(ndev); + return 0; } EXPORT_SYMBOL_GPL(stmmac_resume); -- cgit v1.2.3 From 0ddfee1feece1c85592d49b759286032ef2dd803 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Fri, 22 May 2020 17:55:45 +0200 Subject: net: phy: mscc: fix initialization of the MACsec protocol mode At the very end of the MACsec block initialization in the MSCC PHY driver, the MACsec "protocol mode" is set. This setting should be set based on the PHY id within the package, as the bank used to access the register used depends on this. This was not done correctly, and only the first bank was used leading to the two upper PHYs being unstable when using the VSC8584. This patch fixes it. Fixes: 1bbe0ecc2a1a ("net: phy: mscc: macsec initialization") Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- drivers/net/phy/mscc/mscc.h | 2 ++ drivers/net/phy/mscc/mscc_mac.h | 6 +++--- drivers/net/phy/mscc/mscc_macsec.c | 16 ++++++++++------ drivers/net/phy/mscc/mscc_macsec.h | 3 ++- drivers/net/phy/mscc/mscc_main.c | 4 ++++ 5 files changed, 21 insertions(+), 10 deletions(-) diff --git a/drivers/net/phy/mscc/mscc.h b/drivers/net/phy/mscc/mscc.h index 030bf8b600df..414e3b31bb1f 100644 --- a/drivers/net/phy/mscc/mscc.h +++ b/drivers/net/phy/mscc/mscc.h @@ -354,6 +354,8 @@ struct vsc8531_private { u64 *stats; int nstats; bool pkg_init; + /* PHY address within the package. 
*/ + u8 addr; /* For multiple port PHYs; the MDIO address of the base PHY in the * package. */ diff --git a/drivers/net/phy/mscc/mscc_mac.h b/drivers/net/phy/mscc/mscc_mac.h index fcb5ba5e5d03..59b6837c60b3 100644 --- a/drivers/net/phy/mscc/mscc_mac.h +++ b/drivers/net/phy/mscc/mscc_mac.h @@ -152,8 +152,8 @@ #define MSCC_MAC_PAUSE_CFG_STATE_PAUSE_STATE BIT(0) #define MSCC_MAC_PAUSE_CFG_STATE_MAC_TX_PAUSE_GEN BIT(4) -#define MSCC_PROC_0_IP_1588_TOP_CFG_STAT_MODE_CTL 0x2 -#define MSCC_PROC_0_IP_1588_TOP_CFG_STAT_MODE_CTL_PROTOCOL_MODE(x) (x) -#define MSCC_PROC_0_IP_1588_TOP_CFG_STAT_MODE_CTL_PROTOCOL_MODE_M GENMASK(2, 0) +#define MSCC_PROC_IP_1588_TOP_CFG_STAT_MODE_CTL 0x2 +#define MSCC_PROC_IP_1588_TOP_CFG_STAT_MODE_CTL_PROTOCOL_MODE(x) (x) +#define MSCC_PROC_IP_1588_TOP_CFG_STAT_MODE_CTL_PROTOCOL_MODE_M GENMASK(2, 0) #endif /* _MSCC_PHY_LINE_MAC_H_ */ diff --git a/drivers/net/phy/mscc/mscc_macsec.c b/drivers/net/phy/mscc/mscc_macsec.c index e99e2cd72a0c..b4d3dc4068e2 100644 --- a/drivers/net/phy/mscc/mscc_macsec.c +++ b/drivers/net/phy/mscc/mscc_macsec.c @@ -316,6 +316,8 @@ static void vsc8584_macsec_mac_init(struct phy_device *phydev, /* Must be called with mdio_lock taken */ static int __vsc8584_macsec_init(struct phy_device *phydev) { + struct vsc8531_private *priv = phydev->priv; + enum macsec_bank proc_bank; u32 val; vsc8584_macsec_block_init(phydev, MACSEC_INGR); @@ -351,12 +353,14 @@ static int __vsc8584_macsec_init(struct phy_device *phydev) val |= MSCC_FCBUF_ENA_CFG_TX_ENA | MSCC_FCBUF_ENA_CFG_RX_ENA; vsc8584_macsec_phy_write(phydev, FC_BUFFER, MSCC_FCBUF_ENA_CFG, val); - val = vsc8584_macsec_phy_read(phydev, IP_1588, - MSCC_PROC_0_IP_1588_TOP_CFG_STAT_MODE_CTL); - val &= ~MSCC_PROC_0_IP_1588_TOP_CFG_STAT_MODE_CTL_PROTOCOL_MODE_M; - val |= MSCC_PROC_0_IP_1588_TOP_CFG_STAT_MODE_CTL_PROTOCOL_MODE(4); - vsc8584_macsec_phy_write(phydev, IP_1588, - MSCC_PROC_0_IP_1588_TOP_CFG_STAT_MODE_CTL, val); + proc_bank = (priv->addr < 2) ? PROC_0 : PROC_2; + + val = vsc8584_macsec_phy_read(phydev, proc_bank, + MSCC_PROC_IP_1588_TOP_CFG_STAT_MODE_CTL); + val &= ~MSCC_PROC_IP_1588_TOP_CFG_STAT_MODE_CTL_PROTOCOL_MODE_M; + val |= MSCC_PROC_IP_1588_TOP_CFG_STAT_MODE_CTL_PROTOCOL_MODE(4); + vsc8584_macsec_phy_write(phydev, proc_bank, + MSCC_PROC_IP_1588_TOP_CFG_STAT_MODE_CTL, val); return 0; } diff --git a/drivers/net/phy/mscc/mscc_macsec.h b/drivers/net/phy/mscc/mscc_macsec.h index d0783944d106..d751f2946b79 100644 --- a/drivers/net/phy/mscc/mscc_macsec.h +++ b/drivers/net/phy/mscc/mscc_macsec.h @@ -64,7 +64,8 @@ enum macsec_bank { FC_BUFFER = 0x04, HOST_MAC = 0x05, LINE_MAC = 0x06, - IP_1588 = 0x0e, + PROC_0 = 0x0e, + PROC_2 = 0x0f, MACSEC_INGR = 0x38, MACSEC_EGR = 0x3c, }; diff --git a/drivers/net/phy/mscc/mscc_main.c b/drivers/net/phy/mscc/mscc_main.c index acddef79f4e8..c8aa6d905d8e 100644 --- a/drivers/net/phy/mscc/mscc_main.c +++ b/drivers/net/phy/mscc/mscc_main.c @@ -1347,6 +1347,8 @@ static int vsc8584_config_init(struct phy_device *phydev) else vsc8531->base_addr = phydev->mdio.addr - addr; + vsc8531->addr = addr; + /* Some parts of the init sequence are identical for every PHY in the * package. Some parts are modifying the GPIO register bank which is a * set of registers that are affecting all PHYs, a few resetting the @@ -1771,6 +1773,8 @@ static int vsc8514_config_init(struct phy_device *phydev) else vsc8531->base_addr = phydev->mdio.addr - addr; + vsc8531->addr = addr; + /* Some parts of the init sequence are identical for every PHY in the * package. 
Some parts are modifying the GPIO register bank which is a * set of registers that are affecting all PHYs, a few resetting the -- cgit v1.2.3 From 4c64b83d03f4aafcdf710caad994cbc855802e74 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Fri, 22 May 2020 20:09:28 +0300 Subject: net: ethernet: ti: cpsw: fix ASSERT_RTNL() warning during suspend vlan_for_each() are required to be called with rtnl_lock taken, otherwise ASSERT_RTNL() warning will be triggered - which happens now during System resume from suspend: cpsw_suspend() |- cpsw_ndo_stop() |- __hw_addr_ref_unsync_dev() |- cpsw_purge_all_mc() |- vlan_for_each() |- ASSERT_RTNL(); Hence, fix it by surrounding cpsw_ndo_stop() by rtnl_lock/unlock() calls. Fixes: 15180eca569b ("net: ethernet: ti: cpsw: fix vlan mcast") Signed-off-by: Grygorii Strashko Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpsw.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index c2c5bf87da01..ffeb8633e530 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -1753,11 +1753,15 @@ static int cpsw_suspend(struct device *dev) struct cpsw_common *cpsw = dev_get_drvdata(dev); int i; + rtnl_lock(); + for (i = 0; i < cpsw->data.slaves; i++) if (cpsw->slaves[i].ndev) if (netif_running(cpsw->slaves[i].ndev)) cpsw_ndo_stop(cpsw->slaves[i].ndev); + rtnl_unlock(); + /* Select sleep pin state */ pinctrl_pm_select_sleep_state(dev); -- cgit v1.2.3 From febfd9d3c7f74063e8e630b15413ca91b567f963 Mon Sep 17 00:00:00 2001 From: Qiushi Wu Date: Fri, 22 May 2020 14:07:15 -0500 Subject: net/mlx4_core: fix a memory leak bug. In function mlx4_opreq_action(), pointer "mailbox" is not released, when mlx4_cmd_box() return and error, causing a memory leak bug. Fix this issue by going to "out" label, mlx4_free_cmd_mailbox() can free this pointer. Fixes: fe6f700d6cbb ("net/mlx4_core: Respond to operation request by firmware") Signed-off-by: Qiushi Wu Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/fw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 6e501af0e532..f6ff9620a137 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -2734,7 +2734,7 @@ void mlx4_opreq_action(struct work_struct *work) if (err) { mlx4_err(dev, "Failed to retrieve required operation: %d\n", err); - return; + goto out; } MLX4_GET(modifier, outbox, GET_OP_REQ_MODIFIER_OFFSET); MLX4_GET(token, outbox, GET_OP_REQ_TOKEN_OFFSET); -- cgit v1.2.3 From 539d39ad0c61b35f69565a037d7586deaf6d6166 Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Sat, 23 May 2020 16:08:20 +0800 Subject: net: smsc911x: Fix runtime PM imbalance on error Remove runtime PM usage counter decrement when the increment function has not been called to keep the counter balanced. Signed-off-by: Dinghao Liu Signed-off-by: David S. 
Miller --- drivers/net/ethernet/smsc/smsc911x.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/smsc/smsc911x.c b/drivers/net/ethernet/smsc/smsc911x.c index 49a6a9167af4..fc168f85e7af 100644 --- a/drivers/net/ethernet/smsc/smsc911x.c +++ b/drivers/net/ethernet/smsc/smsc911x.c @@ -2493,20 +2493,20 @@ static int smsc911x_drv_probe(struct platform_device *pdev) retval = smsc911x_init(dev); if (retval < 0) - goto out_disable_resources; + goto out_init_fail; netif_carrier_off(dev); retval = smsc911x_mii_init(pdev, dev); if (retval) { SMSC_WARN(pdata, probe, "Error %i initialising mii", retval); - goto out_disable_resources; + goto out_init_fail; } retval = register_netdev(dev); if (retval) { SMSC_WARN(pdata, probe, "Error %i registering device", retval); - goto out_disable_resources; + goto out_init_fail; } else { SMSC_TRACE(pdata, probe, "Network interface: \"%s\"", dev->name); @@ -2547,9 +2547,10 @@ static int smsc911x_drv_probe(struct platform_device *pdev) return 0; -out_disable_resources: +out_init_fail: pm_runtime_put(&pdev->dev); pm_runtime_disable(&pdev->dev); +out_disable_resources: (void)smsc911x_disable_resources(pdev); out_enable_resources_fail: smsc911x_free_resources(pdev); -- cgit v1.2.3
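More generally, the rule illustrated by the smsc911x fix above is that an error label should only undo what has already been set up: the runtime-PM usage counter is decremented only on paths where it was actually incremented. A purely illustrative probe skeleton (all example_* names are hypothetical, not the smsc911x code):

static int example_drv_probe(struct platform_device *pdev)
{
        int ret;

        ret = example_acquire_resources(pdev);  /* no PM reference taken yet */
        if (ret)
                return ret;

        pm_runtime_enable(&pdev->dev);
        pm_runtime_get_sync(&pdev->dev);        /* usage counter ++ */

        ret = example_hw_init(pdev);
        if (ret)
                goto out_pm_put;                /* balance the get above */

        pm_runtime_put(&pdev->dev);             /* keep device bound, drop ref */
        return 0;

out_pm_put:
        pm_runtime_put(&pdev->dev);             /* only after a matching get */
        pm_runtime_disable(&pdev->dev);
        example_release_resources(pdev);
        return ret;
}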