From 94579ac3f6d0820adc83b5dc5358ead0158101e9 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 1 Jun 2020 16:39:37 -0500 Subject: xfrm: Fix double ESP trailer insertion in IPsec crypto offload. During IPsec performance testing, we see bad ICMP checksum. The error packet has duplicated ESP trailer due to double validate_xmit_xfrm calls. The first call is from ip_output, but the packet cannot be sent because netif_xmit_frozen_or_stopped is true and the packet gets dev_requeue_skb. The second call is from NET_TX softirq. However after the first call, the packet already has the ESP trailer. Fix by marking the skb with XFRM_XMIT bit after the packet is handled by validate_xmit_xfrm to avoid duplicate ESP trailer insertion. Fixes: f6e27114a60a ("net: Add a xfrm validate function to validate_xmit_skb") Signed-off-by: Huy Nguyen Reviewed-by: Boris Pismenny Reviewed-by: Raed Salem Reviewed-by: Saeed Mahameed Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 + net/xfrm/xfrm_device.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 094fe682f5d7..c7d213c9f9d8 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1008,6 +1008,7 @@ struct xfrm_offload { #define XFRM_GRO 32 #define XFRM_ESP_NO_TRAILER 64 #define XFRM_DEV_RESUME 128 +#define XFRM_XMIT 256 __u32 status; #define CRYPTO_SUCCESS 1 diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index f50d1f97cf8e..626096bd0d29 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -108,7 +108,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur struct xfrm_offload *xo = xfrm_offload(skb); struct sec_path *sp; - if (!xo) + if (!xo || (xo->flags & XFRM_XMIT)) return skb; if (!(features & NETIF_F_HW_ESP)) @@ -129,6 +129,8 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur return skb; } + xo->flags |= XFRM_XMIT; + if (skb_is_gso(skb)) { struct net_device *dev = skb->dev; -- cgit v1.2.3 From a4902d914e508f3691fa7ef885a76d2b7e735805 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 11 May 2020 13:00:15 +1000 Subject: xfrm: merge fixup for "remove output_finish indirection from xfrm_state_afinfo" Signed-off-by: Stephen Rothwell Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_output.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index e4c23f69f69f..a7ab19353313 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -574,16 +574,12 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) switch (x->outer_mode.family) { case AF_INET: memset(IPCB(skb), 0, sizeof(*IPCB(skb))); -#ifdef CONFIG_NETFILTER IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; -#endif break; case AF_INET6: memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); -#ifdef CONFIG_NETFILTER IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; -#endif break; } -- cgit v1.2.3 From 7d4e3919592593594296944ae7da1a207706e8c5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 10 Jun 2020 09:14:35 -0700 Subject: esp, ah: consolidate the crypto algorithm selections Instead of duplicating the algorithm selections between INET_AH and INET6_AH and between INET_ESP and INET6_ESP, create new tristates XFRM_AH and XFRM_ESP that do the algorithm selections, and make these be selected by the corresponding INET* options. Suggested-by: Herbert Xu Acked-by: Herbert Xu Cc: Corentin Labbe Cc: Greg Kroah-Hartman Cc: Steffen Klassert Signed-off-by: Eric Biggers Signed-off-by: Steffen Klassert --- net/ipv4/Kconfig | 16 ++-------------- net/ipv6/Kconfig | 16 ++-------------- net/xfrm/Kconfig | 20 ++++++++++++++++++++ 3 files changed, 24 insertions(+), 28 deletions(-) diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 23ba5045e3d3..39a7a21744dc 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -340,11 +340,7 @@ config NET_FOU_IP_TUNNELS config INET_AH tristate "IP: AH transformation" - select XFRM_ALGO - select CRYPTO - select CRYPTO_HMAC - select CRYPTO_MD5 - select CRYPTO_SHA1 + select XFRM_AH ---help--- Support for IPsec AH. @@ -352,15 +348,7 @@ config INET_AH config INET_ESP tristate "IP: ESP transformation" - select XFRM_ALGO - select CRYPTO - select CRYPTO_AUTHENC - select CRYPTO_HMAC - select CRYPTO_MD5 - select CRYPTO_CBC - select CRYPTO_SHA1 - select CRYPTO_DES - select CRYPTO_ECHAINIV + select XFRM_ESP ---help--- Support for IPsec ESP. diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 4f03aece2980..70313f16319d 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -49,11 +49,7 @@ config IPV6_OPTIMISTIC_DAD config INET6_AH tristate "IPv6: AH transformation" - select XFRM_ALGO - select CRYPTO - select CRYPTO_HMAC - select CRYPTO_MD5 - select CRYPTO_SHA1 + select XFRM_AH ---help--- Support for IPsec AH. @@ -61,15 +57,7 @@ config INET6_AH config INET6_ESP tristate "IPv6: ESP transformation" - select XFRM_ALGO - select CRYPTO - select CRYPTO_AUTHENC - select CRYPTO_HMAC - select CRYPTO_MD5 - select CRYPTO_CBC - select CRYPTO_SHA1 - select CRYPTO_DES - select CRYPTO_ECHAINIV + select XFRM_ESP ---help--- Support for IPsec ESP. diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index b7fd9c838416..169c22140709 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -67,6 +67,26 @@ config XFRM_STATISTICS If unsure, say N. +config XFRM_AH + tristate + select XFRM_ALGO + select CRYPTO + select CRYPTO_HMAC + select CRYPTO_MD5 + select CRYPTO_SHA1 + +config XFRM_ESP + tristate + select XFRM_ALGO + select CRYPTO + select CRYPTO_AUTHENC + select CRYPTO_HMAC + select CRYPTO_MD5 + select CRYPTO_CBC + select CRYPTO_SHA1 + select CRYPTO_DES + select CRYPTO_ECHAINIV + config XFRM_IPCOMP tristate select XFRM_ALGO -- cgit v1.2.3 From 37ea0f18fb19e0646c166037043232915cd9e995 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 10 Jun 2020 09:14:36 -0700 Subject: esp: select CRYPTO_SEQIV Commit f23efcbcc523 ("crypto: ctr - no longer needs CRYPTO_SEQIV") made CRYPTO_CTR stop selecting CRYPTO_SEQIV. This breaks IPsec for most users since GCM and several other encryption algorithms require "seqiv" -- and RFC 8221 lists AES-GCM as "MUST" be implemented. Just make XFRM_ESP select CRYPTO_SEQIV. Fixes: f23efcbcc523 ("crypto: ctr - no longer needs CRYPTO_SEQIV") Acked-by: Herbert Xu Cc: Corentin Labbe Cc: Greg Kroah-Hartman Cc: Steffen Klassert Signed-off-by: Eric Biggers Signed-off-by: Steffen Klassert --- net/xfrm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index 169c22140709..b2ff8df2c836 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -86,6 +86,7 @@ config XFRM_ESP select CRYPTO_SHA1 select CRYPTO_DES select CRYPTO_ECHAINIV + select CRYPTO_SEQIV config XFRM_IPCOMP tristate -- cgit v1.2.3 From be01369859b8aa07346e497381bb46d377da0d8c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 10 Jun 2020 09:14:37 -0700 Subject: esp, ah: modernize the crypto algorithm selections The crypto algorithms selected by the ESP and AH kconfig options are out-of-date with the guidance of RFC 8221, which lists the legacy algorithms MD5 and DES as "MUST NOT" be implemented, and some more modern algorithms like AES-GCM and HMAC-SHA256 as "MUST" be implemented. But the options select the legacy algorithms, not the modern ones. Therefore, modify these options to select the MUST algorithms -- and *only* the MUST algorithms. Also improve the help text. Note that other algorithms may still be explicitly enabled in the kconfig, and the choice of which to actually use is still controlled by userspace. This change only modifies the list of algorithms for which kernel support is guaranteed to be present. Suggested-by: Herbert Xu Suggested-by: Steffen Klassert Acked-by: Herbert Xu Cc: Corentin Labbe Cc: Greg Kroah-Hartman Signed-off-by: Eric Biggers Signed-off-by: Steffen Klassert --- net/ipv4/Kconfig | 18 ++++++++++++++++-- net/ipv6/Kconfig | 18 ++++++++++++++++-- net/xfrm/Kconfig | 15 +++++++++------ 3 files changed, 41 insertions(+), 10 deletions(-) diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 39a7a21744dc..dc9dfaef77e5 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -342,7 +342,14 @@ config INET_AH tristate "IP: AH transformation" select XFRM_AH ---help--- - Support for IPsec AH. + Support for IPsec AH (Authentication Header). + + AH can be used with various authentication algorithms. Besides + enabling AH support itself, this option enables the generic + implementations of the algorithms that RFC 8221 lists as MUST be + implemented. If you need any other algorithms, you'll need to enable + them in the crypto API. You should also enable accelerated + implementations of any needed algorithms when available. If unsure, say Y. @@ -350,7 +357,14 @@ config INET_ESP tristate "IP: ESP transformation" select XFRM_ESP ---help--- - Support for IPsec ESP. + Support for IPsec ESP (Encapsulating Security Payload). + + ESP can be used with various encryption and authentication algorithms. + Besides enabling ESP support itself, this option enables the generic + implementations of the algorithms that RFC 8221 lists as MUST be + implemented. If you need any other algorithms, you'll need to enable + them in the crypto API. You should also enable accelerated + implementations of any needed algorithms when available. If unsure, say Y. diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 70313f16319d..414a68b16869 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -51,7 +51,14 @@ config INET6_AH tristate "IPv6: AH transformation" select XFRM_AH ---help--- - Support for IPsec AH. + Support for IPsec AH (Authentication Header). + + AH can be used with various authentication algorithms. Besides + enabling AH support itself, this option enables the generic + implementations of the algorithms that RFC 8221 lists as MUST be + implemented. If you need any other algorithms, you'll need to enable + them in the crypto API. You should also enable accelerated + implementations of any needed algorithms when available. If unsure, say Y. @@ -59,7 +66,14 @@ config INET6_ESP tristate "IPv6: ESP transformation" select XFRM_ESP ---help--- - Support for IPsec ESP. + Support for IPsec ESP (Encapsulating Security Payload). + + ESP can be used with various encryption and authentication algorithms. + Besides enabling ESP support itself, this option enables the generic + implementations of the algorithms that RFC 8221 lists as MUST be + implemented. If you need any other algorithms, you'll need to enable + them in the crypto API. You should also enable accelerated + implementations of any needed algorithms when available. If unsure, say Y. diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index b2ff8df2c836..e77ba529229c 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -67,26 +67,29 @@ config XFRM_STATISTICS If unsure, say N. +# This option selects XFRM_ALGO along with the AH authentication algorithms that +# RFC 8221 lists as MUST be implemented. config XFRM_AH tristate select XFRM_ALGO select CRYPTO select CRYPTO_HMAC - select CRYPTO_MD5 - select CRYPTO_SHA1 + select CRYPTO_SHA256 +# This option selects XFRM_ALGO along with the ESP encryption and authentication +# algorithms that RFC 8221 lists as MUST be implemented. config XFRM_ESP tristate select XFRM_ALGO select CRYPTO + select CRYPTO_AES select CRYPTO_AUTHENC - select CRYPTO_HMAC - select CRYPTO_MD5 select CRYPTO_CBC - select CRYPTO_SHA1 - select CRYPTO_DES select CRYPTO_ECHAINIV + select CRYPTO_GCM + select CRYPTO_HMAC select CRYPTO_SEQIV + select CRYPTO_SHA256 config XFRM_IPCOMP tristate -- cgit v1.2.3 From b0659d8a950d424e57cc0a67afc4740ee561224e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 15 Jun 2020 14:49:26 -0700 Subject: bpf: Fix definition of bpf_ringbuf_output() helper in UAPI comments Fix definition of bpf_ringbuf_output() in UAPI header comments, which is used to generate libbpf's bpf_helper_defs.h header. Return value is a number (error code), not a pointer. Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200615214926.3638836-1-andriin@fb.com --- include/uapi/linux/bpf.h | 2 +- tools/include/uapi/linux/bpf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 19684813faae..974a71342aea 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3168,7 +3168,7 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * void *bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * int bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) * Description * Copy *size* bytes from *data* into a ring buffer *ringbuf*. * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 19684813faae..974a71342aea 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3168,7 +3168,7 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * void *bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * int bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) * Description * Copy *size* bytes from *data* into a ring buffer *ringbuf*. * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of -- cgit v1.2.3 From c34a06c56df7c513cd19f591b26fe7d814c778cc Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 15 Jun 2020 15:53:55 -0700 Subject: tools/bpftool: Add ringbuf map to a list of known map types Add symbolic name "ringbuf" to map to BPF_MAP_TYPE_RINGBUF. Without this, users will see "type 27" instead of "ringbuf" in `map show` output. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200615225355.366256-1-andriin@fb.com --- tools/bpf/bpftool/map.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index c5fac8068ba1..99109a6afe17 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -49,6 +49,7 @@ const char * const map_type_name[] = { [BPF_MAP_TYPE_STACK] = "stack", [BPF_MAP_TYPE_SK_STORAGE] = "sk_storage", [BPF_MAP_TYPE_STRUCT_OPS] = "struct_ops", + [BPF_MAP_TYPE_RINGBUF] = "ringbuf", }; const size_t map_type_name_size = ARRAY_SIZE(map_type_name); -- cgit v1.2.3 From 6903cdae9f9f08d61e49c16cbef11c293e33a615 Mon Sep 17 00:00:00 2001 From: Gaurav Singh Date: Fri, 12 Jun 2020 14:53:27 -0400 Subject: bpf, xdp, samples: Fix null pointer dereference in *_user code Memset on the pointer right after malloc can cause a NULL pointer deference if it failed to allocate memory. A simple fix is to replace malloc()/memset() pair with a simple call to calloc(). Fixes: 0fca931a6f21 ("samples/bpf: program demonstrating access to xdp_rxq_info") Signed-off-by: Gaurav Singh Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Acked-by: John Fastabend --- samples/bpf/xdp_monitor_user.c | 8 ++------ samples/bpf/xdp_redirect_cpu_user.c | 7 ++----- samples/bpf/xdp_rxq_info_user.c | 13 +++---------- 3 files changed, 7 insertions(+), 21 deletions(-) diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c index dd558cbb2309..ef53b93db573 100644 --- a/samples/bpf/xdp_monitor_user.c +++ b/samples/bpf/xdp_monitor_user.c @@ -509,11 +509,8 @@ static void *alloc_rec_per_cpu(int record_size) { unsigned int nr_cpus = bpf_num_possible_cpus(); void *array; - size_t size; - size = record_size * nr_cpus; - array = malloc(size); - memset(array, 0, size); + array = calloc(nr_cpus, record_size); if (!array) { fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus); exit(EXIT_FAIL_MEM); @@ -528,8 +525,7 @@ static struct stats_record *alloc_stats_record(void) int i; /* Alloc main stats_record structure */ - rec = malloc(sizeof(*rec)); - memset(rec, 0, sizeof(*rec)); + rec = calloc(1, sizeof(*rec)); if (!rec) { fprintf(stderr, "Mem alloc error\n"); exit(EXIT_FAIL_MEM); diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c index f3468168982e..f4e755e0dd73 100644 --- a/samples/bpf/xdp_redirect_cpu_user.c +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -207,11 +207,8 @@ static struct datarec *alloc_record_per_cpu(void) { unsigned int nr_cpus = bpf_num_possible_cpus(); struct datarec *array; - size_t size; - size = sizeof(struct datarec) * nr_cpus; - array = malloc(size); - memset(array, 0, size); + array = calloc(nr_cpus, sizeof(struct datarec)); if (!array) { fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus); exit(EXIT_FAIL_MEM); @@ -226,11 +223,11 @@ static struct stats_record *alloc_stats_record(void) size = sizeof(*rec) + n_cpus * sizeof(struct record); rec = malloc(size); - memset(rec, 0, size); if (!rec) { fprintf(stderr, "Mem alloc error\n"); exit(EXIT_FAIL_MEM); } + memset(rec, 0, size); rec->rx_cnt.cpu = alloc_record_per_cpu(); rec->redir_err.cpu = alloc_record_per_cpu(); rec->kthread.cpu = alloc_record_per_cpu(); diff --git a/samples/bpf/xdp_rxq_info_user.c b/samples/bpf/xdp_rxq_info_user.c index 4fe47502ebed..caa4e7ffcfc7 100644 --- a/samples/bpf/xdp_rxq_info_user.c +++ b/samples/bpf/xdp_rxq_info_user.c @@ -198,11 +198,8 @@ static struct datarec *alloc_record_per_cpu(void) { unsigned int nr_cpus = bpf_num_possible_cpus(); struct datarec *array; - size_t size; - size = sizeof(struct datarec) * nr_cpus; - array = malloc(size); - memset(array, 0, size); + array = calloc(nr_cpus, sizeof(struct datarec)); if (!array) { fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus); exit(EXIT_FAIL_MEM); @@ -214,11 +211,8 @@ static struct record *alloc_record_per_rxq(void) { unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries; struct record *array; - size_t size; - size = sizeof(struct record) * nr_rxqs; - array = malloc(size); - memset(array, 0, size); + array = calloc(nr_rxqs, sizeof(struct record)); if (!array) { fprintf(stderr, "Mem alloc error (nr_rxqs:%u)\n", nr_rxqs); exit(EXIT_FAIL_MEM); @@ -232,8 +226,7 @@ static struct stats_record *alloc_stats_record(void) struct stats_record *rec; int i; - rec = malloc(sizeof(*rec)); - memset(rec, 0, sizeof(*rec)); + rec = calloc(1, sizeof(struct stats_record)); if (!rec) { fprintf(stderr, "Mem alloc error\n"); exit(EXIT_FAIL_MEM); -- cgit v1.2.3 From 02553b91da5deb63c8562b47529b09b734659af0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 15 Jun 2020 22:04:30 -0700 Subject: bpf: bpf_probe_read_kernel_str() has to return amount of data read on success During recent refactorings, bpf_probe_read_kernel_str() started returning 0 on success, instead of amount of data successfully read. This majorly breaks applications relying on bpf_probe_read_kernel_str() and bpf_probe_read_str() and their results. Fix this by returning actual number of bytes read. Fixes: 8d92db5c04d1 ("bpf: rework the compat kernel probe handling") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Reviewed-by: Christoph Hellwig Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200616050432.1902042-1-andriin@fb.com --- kernel/trace/bpf_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e729c9e587a0..a3ac7de98baa 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -241,7 +241,7 @@ bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr) if (unlikely(ret < 0)) goto fail; - return 0; + return ret; fail: memset(dst, 0, size); return ret; -- cgit v1.2.3 From 1c7fb20d6b0acd452cb93d447051eac7724edfa7 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 16 Jun 2020 13:33:03 +0200 Subject: tools, bpftool: Add ringbuf map type to map command docs Commit c34a06c56df7 ("tools/bpftool: Add ringbuf map to a list of known map types") added the symbolic "ringbuf" name. Document it in the bpftool map command docs and usage as well. Signed-off-by: Tobias Klauser Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200616113303.8123-1-tklauser@distanz.ch --- tools/bpf/bpftool/Documentation/bpftool-map.rst | 2 +- tools/bpf/bpftool/map.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 31101643e57c..70c78faa47ab 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -49,7 +49,7 @@ MAP COMMANDS | | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps** | | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** | | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** -| | **queue** | **stack** | **sk_storage** | **struct_ops** } +| | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** } DESCRIPTION =========== diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 99109a6afe17..1d3b60651078 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -1591,7 +1591,7 @@ static int do_help(int argc, char **argv) " lru_percpu_hash | lpm_trie | array_of_maps | hash_of_maps |\n" " devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n" " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n" - " queue | stack | sk_storage | struct_ops }\n" + " queue | stack | sk_storage | struct_ops | ringbuf }\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, argv[-2]); -- cgit v1.2.3 From 3ff2351651a2ecb73ec9d29119793bde190b2850 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 16 Jun 2020 18:35:18 +0800 Subject: xdp: Handle frame_sz in xdp_convert_zc_to_xdp_frame() In commit 34cc0b338a61 we only handled the frame_sz in convert_to_xdp_frame(). This patch will also handle frame_sz in xdp_convert_zc_to_xdp_frame(). Fixes: 34cc0b338a61 ("xdp: Xdp_frame add member frame_sz and handle in convert_to_xdp_frame") Signed-off-by: Hangbin Liu Signed-off-by: Alexei Starovoitov Acked-by: Jesper Dangaard Brouer Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200616103518.2963410-1-liuhangbin@gmail.com --- net/core/xdp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/xdp.c b/net/core/xdp.c index 90f44f382115..3c45f99e26d5 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -462,6 +462,7 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) xdpf->len = totsize - metasize; xdpf->headroom = 0; xdpf->metasize = metasize; + xdpf->frame_sz = PAGE_SIZE; xdpf->mem.type = MEM_TYPE_PAGE_ORDER0; xsk_buff_free(xdp); -- cgit v1.2.3 From 99c51064fb06146b3d494b745c947e438a10aaa7 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 16 Jun 2020 16:28:29 +0200 Subject: devmap: Use bpf_map_area_alloc() for allocating hash buckets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Syzkaller discovered that creating a hash of type devmap_hash with a large number of entries can hit the memory allocator limit for allocating contiguous memory regions. There's really no reason to use kmalloc_array() directly in the devmap code, so just switch it to the existing bpf_map_area_alloc() function that is used elsewhere. Fixes: 6f9d451ab1a3 ("xdp: Add devmap_hash map type for looking up devices by hashed index") Reported-by: Xiumei Mu Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200616142829.114173-1-toke@redhat.com --- kernel/bpf/devmap.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 0cbb72cdaf63..5fdbc776a760 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -86,12 +86,13 @@ static DEFINE_PER_CPU(struct list_head, dev_flush_list); static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); -static struct hlist_head *dev_map_create_hash(unsigned int entries) +static struct hlist_head *dev_map_create_hash(unsigned int entries, + int numa_node) { int i; struct hlist_head *hash; - hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL); + hash = bpf_map_area_alloc(entries * sizeof(*hash), numa_node); if (hash != NULL) for (i = 0; i < entries; i++) INIT_HLIST_HEAD(&hash[i]); @@ -145,7 +146,8 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) return -EINVAL; if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { - dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets); + dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets, + dtab->map.numa_node); if (!dtab->dev_index_head) goto free_charge; @@ -232,7 +234,7 @@ static void dev_map_free(struct bpf_map *map) } } - kfree(dtab->dev_index_head); + bpf_map_area_free(dtab->dev_index_head); } else { for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev; -- cgit v1.2.3 From d8fe449a9c51a37d844ab607e14e2f5c657d3cf2 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 16 Jun 2020 18:04:14 -0700 Subject: bpf: Don't return EINVAL from {get,set}sockopt when optlen > PAGE_SIZE Attaching to these hooks can break iptables because its optval is usually quite big, or at least bigger than the current PAGE_SIZE limit. David also mentioned some SCTP options can be big (around 256k). For such optvals we expose only the first PAGE_SIZE bytes to the BPF program. BPF program has two options: 1. Set ctx->optlen to 0 to indicate that the BPF's optval should be ignored and the kernel should use original userspace value. 2. Set ctx->optlen to something that's smaller than the PAGE_SIZE. v5: * use ctx->optlen == 0 with trimmed buffer (Alexei Starovoitov) * update the docs accordingly v4: * use temporary buffer to avoid optval == optval_end == NULL; this removes the corner case in the verifier that might assume non-zero PTR_TO_PACKET/PTR_TO_PACKET_END. v3: * don't increase the limit, bypass the argument v2: * proper comments formatting (Jakub Kicinski) Fixes: 0d01da6afc54 ("bpf: implement getsockopt and setsockopt hooks") Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Cc: David Laight Link: https://lore.kernel.org/bpf/20200617010416.93086-1-sdf@google.com --- kernel/bpf/cgroup.c | 53 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 4d76f16524cc..ac53102e244a 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1276,16 +1276,23 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) { - if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0) + if (unlikely(max_optlen < 0)) return -EINVAL; + if (unlikely(max_optlen > PAGE_SIZE)) { + /* We don't expose optvals that are greater than PAGE_SIZE + * to the BPF program. + */ + max_optlen = PAGE_SIZE; + } + ctx->optval = kzalloc(max_optlen, GFP_USER); if (!ctx->optval) return -ENOMEM; ctx->optval_end = ctx->optval + max_optlen; - return 0; + return max_optlen; } static void sockopt_free_buf(struct bpf_sockopt_kern *ctx) @@ -1319,13 +1326,13 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, */ max_optlen = max_t(int, 16, *optlen); - ret = sockopt_alloc_buf(&ctx, max_optlen); - if (ret) - return ret; + max_optlen = sockopt_alloc_buf(&ctx, max_optlen); + if (max_optlen < 0) + return max_optlen; ctx.optlen = *optlen; - if (copy_from_user(ctx.optval, optval, *optlen) != 0) { + if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) { ret = -EFAULT; goto out; } @@ -1353,8 +1360,14 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, /* export any potential modifications */ *level = ctx.level; *optname = ctx.optname; - *optlen = ctx.optlen; - *kernel_optval = ctx.optval; + + /* optlen == 0 from BPF indicates that we should + * use original userspace data. + */ + if (ctx.optlen != 0) { + *optlen = ctx.optlen; + *kernel_optval = ctx.optval; + } } out: @@ -1385,12 +1398,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) return retval; - ret = sockopt_alloc_buf(&ctx, max_optlen); - if (ret) - return ret; - ctx.optlen = max_optlen; + max_optlen = sockopt_alloc_buf(&ctx, max_optlen); + if (max_optlen < 0) + return max_optlen; + if (!retval) { /* If kernel getsockopt finished successfully, * copy whatever was returned to the user back @@ -1404,10 +1417,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, goto out; } - if (ctx.optlen > max_optlen) - ctx.optlen = max_optlen; - - if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) { + if (copy_from_user(ctx.optval, optval, + min(ctx.optlen, max_optlen)) != 0) { ret = -EFAULT; goto out; } @@ -1436,10 +1447,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, goto out; } - if (copy_to_user(optval, ctx.optval, ctx.optlen) || - put_user(ctx.optlen, optlen)) { - ret = -EFAULT; - goto out; + if (ctx.optlen != 0) { + if (copy_to_user(optval, ctx.optval, ctx.optlen) || + put_user(ctx.optlen, optlen)) { + ret = -EFAULT; + goto out; + } } ret = ctx.retval; -- cgit v1.2.3 From a0cb12b03132befbdb29d835ca330c18792e8134 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 16 Jun 2020 18:04:15 -0700 Subject: selftests/bpf: Make sure optvals > PAGE_SIZE are bypassed We are relying on the fact, that we can pass > sizeof(int) optvals to the SOL_IP+IP_FREEBIND option (the kernel will take first 4 bytes). In the BPF program we check that we can only touch PAGE_SIZE bytes, but the real optlen is PAGE_SIZE * 2. In both cases, we override it to some predefined value and trim the optlen. Also, let's modify exiting IP_TOS usecase to test optlen=0 case where BPF program just bypasses the data as is. Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200617010416.93086-2-sdf@google.com --- .../testing/selftests/bpf/prog_tests/sockopt_sk.c | 46 +++++++++++++++--- tools/testing/selftests/bpf/progs/sockopt_sk.c | 54 +++++++++++++++++++++- 2 files changed, 91 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c index 2061a6beac0f..5f54c6aec7f0 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c @@ -13,6 +13,7 @@ static int getsetsockopt(void) char cc[16]; /* TCP_CA_NAME_MAX */ } buf = {}; socklen_t optlen; + char *big_buf = NULL; fd = socket(AF_INET, SOCK_STREAM, 0); if (fd < 0) { @@ -22,24 +23,31 @@ static int getsetsockopt(void) /* IP_TOS - BPF bypass */ - buf.u8[0] = 0x08; - err = setsockopt(fd, SOL_IP, IP_TOS, &buf, 1); + optlen = getpagesize() * 2; + big_buf = calloc(1, optlen); + if (!big_buf) { + log_err("Couldn't allocate two pages"); + goto err; + } + + *(int *)big_buf = 0x08; + err = setsockopt(fd, SOL_IP, IP_TOS, big_buf, optlen); if (err) { log_err("Failed to call setsockopt(IP_TOS)"); goto err; } - buf.u8[0] = 0x00; + memset(big_buf, 0, optlen); optlen = 1; - err = getsockopt(fd, SOL_IP, IP_TOS, &buf, &optlen); + err = getsockopt(fd, SOL_IP, IP_TOS, big_buf, &optlen); if (err) { log_err("Failed to call getsockopt(IP_TOS)"); goto err; } - if (buf.u8[0] != 0x08) { - log_err("Unexpected getsockopt(IP_TOS) buf[0] 0x%02x != 0x08", - buf.u8[0]); + if (*(int *)big_buf != 0x08) { + log_err("Unexpected getsockopt(IP_TOS) optval 0x%x != 0x08", + *(int *)big_buf); goto err; } @@ -78,6 +86,28 @@ static int getsetsockopt(void) goto err; } + /* IP_FREEBIND - BPF can't access optval past PAGE_SIZE */ + + optlen = getpagesize() * 2; + memset(big_buf, 0, optlen); + + err = setsockopt(fd, SOL_IP, IP_FREEBIND, big_buf, optlen); + if (err != 0) { + log_err("Failed to call setsockopt, ret=%d", err); + goto err; + } + + err = getsockopt(fd, SOL_IP, IP_FREEBIND, big_buf, &optlen); + if (err != 0) { + log_err("Failed to call getsockopt, ret=%d", err); + goto err; + } + + if (optlen != 1 || *(__u8 *)big_buf != 0x55) { + log_err("Unexpected IP_FREEBIND getsockopt, optlen=%d, optval=0x%x", + optlen, *(__u8 *)big_buf); + } + /* SO_SNDBUF is overwritten */ buf.u32 = 0x01010101; @@ -124,9 +154,11 @@ static int getsetsockopt(void) goto err; } + free(big_buf); close(fd); return 0; err: + free(big_buf); close(fd); return -1; } diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c index d5a5eeb5fb52..712df7b49cb1 100644 --- a/tools/testing/selftests/bpf/progs/sockopt_sk.c +++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c @@ -8,6 +8,10 @@ char _license[] SEC("license") = "GPL"; __u32 _version SEC("version") = 1; +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + #define SOL_CUSTOM 0xdeadbeef struct sockopt_sk { @@ -28,12 +32,14 @@ int _getsockopt(struct bpf_sockopt *ctx) __u8 *optval = ctx->optval; struct sockopt_sk *storage; - if (ctx->level == SOL_IP && ctx->optname == IP_TOS) + if (ctx->level == SOL_IP && ctx->optname == IP_TOS) { /* Not interested in SOL_IP:IP_TOS; * let next BPF program in the cgroup chain or kernel * handle it. */ + ctx->optlen = 0; /* bypass optval>PAGE_SIZE */ return 1; + } if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF) { /* Not interested in SOL_SOCKET:SO_SNDBUF; @@ -51,6 +57,26 @@ int _getsockopt(struct bpf_sockopt *ctx) return 1; } + if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) { + if (optval + 1 > optval_end) + return 0; /* EPERM, bounds check */ + + ctx->retval = 0; /* Reset system call return value to zero */ + + /* Always export 0x55 */ + optval[0] = 0x55; + ctx->optlen = 1; + + /* Userspace buffer is PAGE_SIZE * 2, but BPF + * program can only see the first PAGE_SIZE + * bytes of data. + */ + if (optval_end - optval != PAGE_SIZE) + return 0; /* EPERM, unexpected data size */ + + return 1; + } + if (ctx->level != SOL_CUSTOM) return 0; /* EPERM, deny everything except custom level */ @@ -81,12 +107,14 @@ int _setsockopt(struct bpf_sockopt *ctx) __u8 *optval = ctx->optval; struct sockopt_sk *storage; - if (ctx->level == SOL_IP && ctx->optname == IP_TOS) + if (ctx->level == SOL_IP && ctx->optname == IP_TOS) { /* Not interested in SOL_IP:IP_TOS; * let next BPF program in the cgroup chain or kernel * handle it. */ + ctx->optlen = 0; /* bypass optval>PAGE_SIZE */ return 1; + } if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF) { /* Overwrite SO_SNDBUF value */ @@ -112,6 +140,28 @@ int _setsockopt(struct bpf_sockopt *ctx) return 1; } + if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) { + /* Original optlen is larger than PAGE_SIZE. */ + if (ctx->optlen != PAGE_SIZE * 2) + return 0; /* EPERM, unexpected data size */ + + if (optval + 1 > optval_end) + return 0; /* EPERM, bounds check */ + + /* Make sure we can trim the buffer. */ + optval[0] = 0; + ctx->optlen = 1; + + /* Usepace buffer is PAGE_SIZE * 2, but BPF + * program can only see the first PAGE_SIZE + * bytes of data. + */ + if (optval_end - optval != PAGE_SIZE) + return 0; /* EPERM, unexpected data size */ + + return 1; + } + if (ctx->level != SOL_CUSTOM) return 0; /* EPERM, deny everything except custom level */ -- cgit v1.2.3 From 8030e250d882db174cbcd88273570ffb36a13080 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 16 Jun 2020 18:04:16 -0700 Subject: bpf: Document optval > PAGE_SIZE behavior for sockopt hooks Extend existing doc with more details about requiring ctx->optlen = 0 for handling optval > PAGE_SIZE. Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200617010416.93086-3-sdf@google.com --- Documentation/bpf/prog_cgroup_sockopt.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/Documentation/bpf/prog_cgroup_sockopt.rst b/Documentation/bpf/prog_cgroup_sockopt.rst index c47d974629ae..172f957204bf 100644 --- a/Documentation/bpf/prog_cgroup_sockopt.rst +++ b/Documentation/bpf/prog_cgroup_sockopt.rst @@ -86,6 +86,20 @@ then the next program in the chain (A) will see those changes, *not* the original input ``setsockopt`` arguments. The potentially modified values will be then passed down to the kernel. +Large optval +============ +When the ``optval`` is greater than the ``PAGE_SIZE``, the BPF program +can access only the first ``PAGE_SIZE`` of that data. So it has to options: + +* Set ``optlen`` to zero, which indicates that the kernel should + use the original buffer from the userspace. Any modifications + done by the BPF program to the ``optval`` are ignored. +* Set ``optlen`` to the value less than ``PAGE_SIZE``, which + indicates that the kernel should use BPF's trimmed ``optval``. + +When the BPF program returns with the ``optlen`` greater than +``PAGE_SIZE``, the userspace will receive ``EFAULT`` errno. + Example ======= -- cgit v1.2.3 From aadf9dcef9d4cd68c73a4ab934f93319c4becc47 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 17 Jun 2020 22:50:33 +0100 Subject: rxrpc: Fix trace string The trace symbol printer (__print_symbolic()) ignores symbols that map to an empty string and prints the hex value instead. Fix the symbol for rxrpc_cong_no_change to " -" instead of "" to avoid this. Fixes: b54a134a7de4 ("rxrpc: Fix handling of enums-to-string translation in tracing") Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index ba9efdc848f9..059b6e45a028 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -400,7 +400,7 @@ enum rxrpc_tx_point { EM(rxrpc_cong_begin_retransmission, " Retrans") \ EM(rxrpc_cong_cleared_nacks, " Cleared") \ EM(rxrpc_cong_new_low_nack, " NewLowN") \ - EM(rxrpc_cong_no_change, "") \ + EM(rxrpc_cong_no_change, " -") \ EM(rxrpc_cong_progress, " Progres") \ EM(rxrpc_cong_retransmit_again, " ReTxAgn") \ EM(rxrpc_cong_rtt_window_end, " RttWinE") \ -- cgit v1.2.3 From e869e7a17798d85829fa7d4f9bbe1eebd4b2d3f6 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Mon, 15 Jun 2020 10:54:56 +0800 Subject: net: usb: ax88179_178a: fix packet alignment padding Using a AX88179 device (0b95:1790), I see two bytes of appended data on every RX packet. For example, this 48-byte ping, using 0xff as a payload byte: 04:20:22.528472 IP 192.168.1.1 > 192.168.1.2: ICMP echo request, id 2447, seq 1, length 64 0x0000: 000a cd35 ea50 000a cd35 ea4f 0800 4500 0x0010: 0054 c116 4000 4001 f63e c0a8 0101 c0a8 0x0020: 0102 0800 b633 098f 0001 87ea cd5e 0000 0x0030: 0000 dcf2 0600 0000 0000 ffff ffff ffff 0x0040: ffff ffff ffff ffff ffff ffff ffff ffff 0x0050: ffff ffff ffff ffff ffff ffff ffff ffff 0x0060: ffff 961f Those last two bytes - 96 1f - aren't part of the original packet. In the ax88179 RX path, the usbnet rx_fixup function trims a 2-byte 'alignment pseudo header' from the start of the packet, and sets the length from a per-packet field populated by hardware. It looks like that length field *includes* the 2-byte header; the current driver assumes that it's excluded. This change trims the 2-byte alignment header after we've set the packet length, so the resulting packet length is correct. While we're moving the comment around, this also fixes the spelling of 'pseudo'. Signed-off-by: Jeremy Kerr Signed-off-by: David S. Miller --- drivers/net/usb/ax88179_178a.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c index 950711448f39..a38e868e44d4 100644 --- a/drivers/net/usb/ax88179_178a.c +++ b/drivers/net/usb/ax88179_178a.c @@ -1491,10 +1491,10 @@ static int ax88179_rx_fixup(struct usbnet *dev, struct sk_buff *skb) } if (pkt_cnt == 0) { - /* Skip IP alignment psudo header */ - skb_pull(skb, 2); skb->len = pkt_len; - skb_set_tail_pointer(skb, pkt_len); + /* Skip IP alignment pseudo header */ + skb_pull(skb, 2); + skb_set_tail_pointer(skb, skb->len); skb->truesize = pkt_len + sizeof(struct sk_buff); ax88179_rx_checksum(skb, pkt_hdr); return 1; @@ -1503,8 +1503,9 @@ static int ax88179_rx_fixup(struct usbnet *dev, struct sk_buff *skb) ax_skb = skb_clone(skb, GFP_ATOMIC); if (ax_skb) { ax_skb->len = pkt_len; - ax_skb->data = skb->data + 2; - skb_set_tail_pointer(ax_skb, pkt_len); + /* Skip IP alignment pseudo header */ + skb_pull(ax_skb, 2); + skb_set_tail_pointer(ax_skb, ax_skb->len); ax_skb->truesize = pkt_len + sizeof(struct sk_buff); ax88179_rx_checksum(ax_skb, pkt_hdr); usbnet_skb_return(dev, ax_skb); -- cgit v1.2.3 From a2ad7c21ad8cf1ce4ad65e13df1c2a1c29b38ac5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 17 Jun 2020 23:01:23 +0100 Subject: rxrpc: Fix handling of rwind from an ACK packet The handling of the receive window size (rwind) from a received ACK packet is not correct. The rxrpc_input_ackinfo() function currently checks the current Tx window size against the rwind from the ACK to see if it has changed, but then limits the rwind size before storing it in the tx_winsize member and, if it increased, wake up the transmitting process. This means that if rwind > RXRPC_RXTX_BUFF_SIZE - 1, this path will always be followed. Fix this by limiting rwind before we compare it to tx_winsize. The effect of this can be seen by enabling the rxrpc_rx_rwind_change tracepoint. Fixes: 702f2ac87a9a ("rxrpc: Wake up the transmitter if Rx window size increases on the peer") Signed-off-by: David Howells --- net/rxrpc/input.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 299ac98e9754..767579328a06 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -722,13 +722,12 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU), rwind, ntohl(ackinfo->jumbo_max)); + if (rwind > RXRPC_RXTX_BUFF_SIZE - 1) + rwind = RXRPC_RXTX_BUFF_SIZE - 1; if (call->tx_winsize != rwind) { - if (rwind > RXRPC_RXTX_BUFF_SIZE - 1) - rwind = RXRPC_RXTX_BUFF_SIZE - 1; if (rwind > call->tx_winsize) wake = true; - trace_rxrpc_rx_rwind_change(call, sp->hdr.serial, - ntohl(ackinfo->rwind), wake); + trace_rxrpc_rx_rwind_change(call, sp->hdr.serial, rwind, wake); call->tx_winsize = rwind; } -- cgit v1.2.3 From 02c28dffb13abbaaedece1e4a6493b48ad3f913a Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 17 Jun 2020 15:46:33 +0100 Subject: rxrpc: Fix afs large storage transmission performance drop Commit 2ad6691d988c, which moved the modification of the status annotation for a packet in the Tx buffer prior to the retransmission moved the state clearance, but managed to lose the bit that set it to UNACK. Consequently, if a retransmission occurs, the packet is accidentally changed to the ACK state (ie. 0) by masking it off, which means that the packet isn't counted towards the tally of newly-ACK'd packets if it gets hard-ACK'd. This then prevents the congestion control algorithm from recovering properly. Fix by reinstating the change of state to UNACK. Spotted by the generic/460 xfstest. Fixes: 2ad6691d988c ("rxrpc: Fix race between incoming ACK parser and retransmitter") Signed-off-by: David Howells --- net/rxrpc/call_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index aa1c8eee6557..6be2672a65ea 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -253,7 +253,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) * confuse things */ annotation &= ~RXRPC_TX_ANNO_MASK; - annotation |= RXRPC_TX_ANNO_RESENT; + annotation |= RXRPC_TX_ANNO_UNACK | RXRPC_TX_ANNO_RESENT; call->rxtx_annotations[ix] = annotation; skb = call->rxtx_buffer[ix]; -- cgit v1.2.3 From 3103b6feb4454646558eedc50ece728bc469f341 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Mon, 15 Jun 2020 18:14:59 -0700 Subject: ionic: no link check while resetting queues If the driver is busy resetting queues after a change in MTU or queue parameters, don't bother checking the link, wait until the next watchdog cycle. Fixes: 987c0871e8ae ("ionic: check for linkup in watchdog") Signed-off-by: Shannon Nelson Acked-by: Jonathan Toppins Signed-off-by: David S. Miller --- drivers/net/ethernet/pensando/ionic/ionic_lif.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index 9d8c969f21cb..bfadc4934702 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -96,7 +96,8 @@ static void ionic_link_status_check(struct ionic_lif *lif) u16 link_status; bool link_up; - if (!test_bit(IONIC_LIF_F_LINK_CHECK_REQUESTED, lif->state)) + if (!test_bit(IONIC_LIF_F_LINK_CHECK_REQUESTED, lif->state) || + test_bit(IONIC_LIF_F_QUEUE_RESET, lif->state)) return; link_status = le16_to_cpu(lif->info->status.link_status); -- cgit v1.2.3 From ef7232da6bcd4294cbb2d424bc35885721570f01 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Tue, 16 Jun 2020 08:06:26 -0700 Subject: ionic: export features for vlans to use Set up vlan_features for use by any vlans above us. Fixes: beead698b173 ("ionic: Add the basic NDO callbacks for netdev support") Signed-off-by: Shannon Nelson Acked-by: Jonathan Toppins Signed-off-by: David S. Miller --- drivers/net/ethernet/pensando/ionic/ionic_lif.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index bfadc4934702..8f29ef133743 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -1246,6 +1246,7 @@ static int ionic_init_nic_features(struct ionic_lif *lif) netdev->hw_features |= netdev->hw_enc_features; netdev->features |= netdev->hw_features; + netdev->vlan_features |= netdev->features & ~NETIF_F_VLAN_FEATURES; netdev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE; -- cgit v1.2.3 From b4748553f53f2971e07d2619f13d461daac0f3bb Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Tue, 16 Jun 2020 10:31:39 +0200 Subject: net: ethernet: mvneta: Fix Serdes configuration for SoCs without comphy The MVNETA_SERDES_CFG register is only available on older SoCs like the Armada XP. On newer SoCs like the Armada 38x the fields are moved to comphy. This patch moves the writes to this register next to the comphy initialization, so that depending on the SoC either comphy or MVNETA_SERDES_CFG is configured. With this we no longer write to the MVNETA_SERDES_CFG on SoCs where it doesn't exist. Suggested-by: Russell King Signed-off-by: Sascha Hauer Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvneta.c | 80 +++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 36 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 946925bbcb2d..987aaaabe4dc 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -106,6 +106,7 @@ #define MVNETA_TX_IN_PRGRS BIT(1) #define MVNETA_TX_FIFO_EMPTY BIT(8) #define MVNETA_RX_MIN_FRAME_SIZE 0x247c +/* Only exists on Armada XP and Armada 370 */ #define MVNETA_SERDES_CFG 0x24A0 #define MVNETA_SGMII_SERDES_PROTO 0x0cc7 #define MVNETA_QSGMII_SERDES_PROTO 0x0667 @@ -3529,26 +3530,55 @@ static int mvneta_setup_txqs(struct mvneta_port *pp) return 0; } -static int mvneta_comphy_init(struct mvneta_port *pp) +static int mvneta_comphy_init(struct mvneta_port *pp, phy_interface_t interface) { int ret; - if (!pp->comphy) - return 0; - - ret = phy_set_mode_ext(pp->comphy, PHY_MODE_ETHERNET, - pp->phy_interface); + ret = phy_set_mode_ext(pp->comphy, PHY_MODE_ETHERNET, interface); if (ret) return ret; return phy_power_on(pp->comphy); } +static int mvneta_config_interface(struct mvneta_port *pp, + phy_interface_t interface) +{ + int ret = 0; + + if (pp->comphy) { + if (interface == PHY_INTERFACE_MODE_SGMII || + interface == PHY_INTERFACE_MODE_1000BASEX || + interface == PHY_INTERFACE_MODE_2500BASEX) { + ret = mvneta_comphy_init(pp, interface); + } + } else { + switch (interface) { + case PHY_INTERFACE_MODE_QSGMII: + mvreg_write(pp, MVNETA_SERDES_CFG, + MVNETA_QSGMII_SERDES_PROTO); + break; + + case PHY_INTERFACE_MODE_SGMII: + case PHY_INTERFACE_MODE_1000BASEX: + mvreg_write(pp, MVNETA_SERDES_CFG, + MVNETA_SGMII_SERDES_PROTO); + break; + default: + return -EINVAL; + } + } + + pp->phy_interface = interface; + + return ret; +} + static void mvneta_start_dev(struct mvneta_port *pp) { int cpu; - WARN_ON(mvneta_comphy_init(pp)); + WARN_ON(mvneta_config_interface(pp, pp->phy_interface)); mvneta_max_rx_size_set(pp, pp->pkt_size); mvneta_txq_max_tx_size_set(pp, pp->pkt_size); @@ -3926,14 +3956,10 @@ static void mvneta_mac_config(struct phylink_config *config, unsigned int mode, if (state->speed == SPEED_2500) new_ctrl4 |= MVNETA_GMAC4_SHORT_PREAMBLE_ENABLE; - if (pp->comphy && pp->phy_interface != state->interface && - (state->interface == PHY_INTERFACE_MODE_SGMII || - state->interface == PHY_INTERFACE_MODE_1000BASEX || - state->interface == PHY_INTERFACE_MODE_2500BASEX)) { - pp->phy_interface = state->interface; - - WARN_ON(phy_power_off(pp->comphy)); - WARN_ON(mvneta_comphy_init(pp)); + if (pp->phy_interface != state->interface) { + if (pp->comphy) + WARN_ON(phy_power_off(pp->comphy)); + WARN_ON(mvneta_config_interface(pp, state->interface)); } if (new_ctrl0 != gmac_ctrl0) @@ -4977,20 +5003,10 @@ static void mvneta_conf_mbus_windows(struct mvneta_port *pp, } /* Power up the port */ -static int mvneta_port_power_up(struct mvneta_port *pp, int phy_mode) +static void mvneta_port_power_up(struct mvneta_port *pp, int phy_mode) { /* MAC Cause register should be cleared */ mvreg_write(pp, MVNETA_UNIT_INTR_CAUSE, 0); - - if (phy_mode == PHY_INTERFACE_MODE_QSGMII) - mvreg_write(pp, MVNETA_SERDES_CFG, MVNETA_QSGMII_SERDES_PROTO); - else if (phy_mode == PHY_INTERFACE_MODE_SGMII || - phy_interface_mode_is_8023z(phy_mode)) - mvreg_write(pp, MVNETA_SERDES_CFG, MVNETA_SGMII_SERDES_PROTO); - else if (!phy_interface_mode_is_rgmii(phy_mode)) - return -EINVAL; - - return 0; } /* Device initialization routine */ @@ -5176,11 +5192,7 @@ static int mvneta_probe(struct platform_device *pdev) if (err < 0) goto err_netdev; - err = mvneta_port_power_up(pp, phy_mode); - if (err < 0) { - dev_err(&pdev->dev, "can't power up port\n"); - goto err_netdev; - } + mvneta_port_power_up(pp, phy_mode); /* Armada3700 network controller does not support per-cpu * operation, so only single NAPI should be initialized. @@ -5334,11 +5346,7 @@ static int mvneta_resume(struct device *device) } } mvneta_defaults_set(pp); - err = mvneta_port_power_up(pp, pp->phy_interface); - if (err < 0) { - dev_err(device, "can't power up port\n"); - return err; - } + mvneta_port_power_up(pp, pp->phy_interface); netif_device_attach(dev); -- cgit v1.2.3 From 1a642ca7f38992b086101fe204a1ae3c90ed8016 Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Tue, 16 Jun 2020 10:31:40 +0200 Subject: net: ethernet: mvneta: Add 2500BaseX support for SoCs without comphy The older SoCs like Armada XP support a 2500BaseX mode in the datasheets referred to as DR-SGMII (Double rated SGMII) or HS-SGMII (High Speed SGMII). This is an upclocked 1000BaseX mode, thus PHY_INTERFACE_MODE_2500BASEX is the appropriate mode define for it. adding support for it merely means writing the correct magic value into the MVNETA_SERDES_CFG register. Signed-off-by: Sascha Hauer Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvneta.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 987aaaabe4dc..af6000172848 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -110,6 +110,7 @@ #define MVNETA_SERDES_CFG 0x24A0 #define MVNETA_SGMII_SERDES_PROTO 0x0cc7 #define MVNETA_QSGMII_SERDES_PROTO 0x0667 +#define MVNETA_HSGMII_SERDES_PROTO 0x1107 #define MVNETA_TYPE_PRIO 0x24bc #define MVNETA_FORCE_UNI BIT(21) #define MVNETA_TXQ_CMD_1 0x24e4 @@ -3564,6 +3565,11 @@ static int mvneta_config_interface(struct mvneta_port *pp, mvreg_write(pp, MVNETA_SERDES_CFG, MVNETA_SGMII_SERDES_PROTO); break; + + case PHY_INTERFACE_MODE_2500BASEX: + mvreg_write(pp, MVNETA_SERDES_CFG, + MVNETA_HSGMII_SERDES_PROTO); + break; default: return -EINVAL; } -- cgit v1.2.3 From 814152a89ed52c722ab92e9fbabcac3cb8a39245 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 16 Jun 2020 09:39:21 +0000 Subject: net: fix memleak in register_netdevice() I got a memleak report when doing some fuzz test: unreferenced object 0xffff888112584000 (size 13599): comm "ip", pid 3048, jiffies 4294911734 (age 343.491s) hex dump (first 32 bytes): 74 61 70 30 00 00 00 00 00 00 00 00 00 00 00 00 tap0............ 00 ee d9 19 81 88 ff ff 00 00 00 00 00 00 00 00 ................ backtrace: [<000000002f60ba65>] __kmalloc_node+0x309/0x3a0 [<0000000075b211ec>] kvmalloc_node+0x7f/0xc0 [<00000000d3a97396>] alloc_netdev_mqs+0x76/0xfc0 [<00000000609c3655>] __tun_chr_ioctl+0x1456/0x3d70 [<000000001127ca24>] ksys_ioctl+0xe5/0x130 [<00000000b7d5e66a>] __x64_sys_ioctl+0x6f/0xb0 [<00000000e1023498>] do_syscall_64+0x56/0xa0 [<000000009ec0eb12>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 unreferenced object 0xffff888111845cc0 (size 8): comm "ip", pid 3048, jiffies 4294911734 (age 343.491s) hex dump (first 8 bytes): 74 61 70 30 00 88 ff ff tap0.... backtrace: [<000000004c159777>] kstrdup+0x35/0x70 [<00000000d8b496ad>] kstrdup_const+0x3d/0x50 [<00000000494e884a>] kvasprintf_const+0xf1/0x180 [<0000000097880a2b>] kobject_set_name_vargs+0x56/0x140 [<000000008fbdfc7b>] dev_set_name+0xab/0xe0 [<000000005b99e3b4>] netdev_register_kobject+0xc0/0x390 [<00000000602704fe>] register_netdevice+0xb61/0x1250 [<000000002b7ca244>] __tun_chr_ioctl+0x1cd1/0x3d70 [<000000001127ca24>] ksys_ioctl+0xe5/0x130 [<00000000b7d5e66a>] __x64_sys_ioctl+0x6f/0xb0 [<00000000e1023498>] do_syscall_64+0x56/0xa0 [<000000009ec0eb12>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 unreferenced object 0xffff88811886d800 (size 512): comm "ip", pid 3048, jiffies 4294911734 (age 343.491s) hex dump (first 32 bytes): 00 00 00 00 ad 4e ad de ff ff ff ff 00 00 00 00 .....N.......... ff ff ff ff ff ff ff ff c0 66 3d a3 ff ff ff ff .........f=..... backtrace: [<0000000050315800>] device_add+0x61e/0x1950 [<0000000021008dfb>] netdev_register_kobject+0x17e/0x390 [<00000000602704fe>] register_netdevice+0xb61/0x1250 [<000000002b7ca244>] __tun_chr_ioctl+0x1cd1/0x3d70 [<000000001127ca24>] ksys_ioctl+0xe5/0x130 [<00000000b7d5e66a>] __x64_sys_ioctl+0x6f/0xb0 [<00000000e1023498>] do_syscall_64+0x56/0xa0 [<000000009ec0eb12>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 If call_netdevice_notifiers() failed, then rollback_registered() calls netdev_unregister_kobject() which holds the kobject. The reference cannot be put because the netdev won't be add to todo list, so it will leads a memleak, we need put the reference to avoid memleak. Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Signed-off-by: David S. Miller --- net/core/dev.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index 6bc2388141f6..44a14b41ad82 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9547,6 +9547,13 @@ int register_netdevice(struct net_device *dev) rcu_barrier(); dev->reg_state = NETREG_UNREGISTERED; + /* We should put the kobject that hold in + * netdev_unregister_kobject(), otherwise + * the net device cannot be freed when + * driver calls free_netdev(), because the + * kobject is being hold. + */ + kobject_put(&dev->dev.kobj); } /* * Prevent userspace races by waiting until the network -- cgit v1.2.3 From fb7861d14c8d7edac65b2fcb6e8031cb138457b2 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 16 Jun 2020 15:52:05 +0000 Subject: net: core: reduce recursion limit value In the current code, ->ndo_start_xmit() can be executed recursively only 10 times because of stack memory. But, in the case of the vxlan, 10 recursion limit value results in a stack overflow. In the current code, the nested interface is limited by 8 depth. There is no critical reason that the recursion limitation value should be 10. So, it would be good to be the same value with the limitation value of nesting interface depth. Test commands: ip link add vxlan10 type vxlan vni 10 dstport 4789 srcport 4789 4789 ip link set vxlan10 up ip a a 192.168.10.1/24 dev vxlan10 ip n a 192.168.10.2 dev vxlan10 lladdr fc:22:33:44:55:66 nud permanent for i in {9..0} do let A=$i+1 ip link add vxlan$i type vxlan vni $i dstport 4789 srcport 4789 4789 ip link set vxlan$i up ip a a 192.168.$i.1/24 dev vxlan$i ip n a 192.168.$i.2 dev vxlan$i lladdr fc:22:33:44:55:66 nud permanent bridge fdb add fc:22:33:44:55:66 dev vxlan$A dst 192.168.$i.2 self done hping3 192.168.10.2 -2 -d 60000 Splat looks like: [ 103.814237][ T1127] ============================================================================= [ 103.871955][ T1127] BUG kmalloc-2k (Tainted: G B ): Padding overwritten. 0x00000000897a2e4f-0x000 [ 103.873187][ T1127] ----------------------------------------------------------------------------- [ 103.873187][ T1127] [ 103.874252][ T1127] INFO: Slab 0x000000005cccc724 objects=5 used=5 fp=0x0000000000000000 flags=0x10000000001020 [ 103.881323][ T1127] CPU: 3 PID: 1127 Comm: hping3 Tainted: G B 5.7.0+ #575 [ 103.882131][ T1127] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 103.883006][ T1127] Call Trace: [ 103.883324][ T1127] dump_stack+0x96/0xdb [ 103.883716][ T1127] slab_err+0xad/0xd0 [ 103.884106][ T1127] ? _raw_spin_unlock+0x1f/0x30 [ 103.884620][ T1127] ? get_partial_node.isra.78+0x140/0x360 [ 103.885214][ T1127] slab_pad_check.part.53+0xf7/0x160 [ 103.885769][ T1127] ? pskb_expand_head+0x110/0xe10 [ 103.886316][ T1127] check_slab+0x97/0xb0 [ 103.886763][ T1127] alloc_debug_processing+0x84/0x1a0 [ 103.887308][ T1127] ___slab_alloc+0x5a5/0x630 [ 103.887765][ T1127] ? pskb_expand_head+0x110/0xe10 [ 103.888265][ T1127] ? lock_downgrade+0x730/0x730 [ 103.888762][ T1127] ? pskb_expand_head+0x110/0xe10 [ 103.889244][ T1127] ? __slab_alloc+0x3e/0x80 [ 103.889675][ T1127] __slab_alloc+0x3e/0x80 [ 103.890108][ T1127] __kmalloc_node_track_caller+0xc7/0x420 [ ... ] Fixes: 11a766ce915f ("net: Increase xmit RECURSION_LIMIT to 10.") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6fc613ed8eae..39e28e11863c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3157,7 +3157,7 @@ static inline int dev_recursion_level(void) return this_cpu_read(softnet_data.xmit.recursion); } -#define XMIT_RECURSION_LIMIT 10 +#define XMIT_RECURSION_LIMIT 8 static inline bool dev_xmit_recursion(void) { return unlikely(__this_cpu_read(softnet_data.xmit.recursion) > -- cgit v1.2.3 From dafabb6590cb15f300b77c095d50312e2c7c8e0f Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 16 Jun 2020 16:04:00 +0000 Subject: ip6_gre: fix use-after-free in ip6gre_tunnel_lookup() In the datapath, the ip6gre_tunnel_lookup() is used and it internally uses fallback tunnel device pointer, which is fb_tunnel_dev. This pointer variable should be set to NULL when a fb interface is deleted. But there is no routine to set fb_tunnel_dev pointer to NULL. So, this pointer will be still used after interface is deleted and it eventually results in the use-after-free problem. Test commands: ip netns add A ip netns add B ip link add eth0 type veth peer name eth1 ip link set eth0 netns A ip link set eth1 netns B ip netns exec A ip link set lo up ip netns exec A ip link set eth0 up ip netns exec A ip link add ip6gre1 type ip6gre local fc:0::1 \ remote fc:0::2 ip netns exec A ip -6 a a fc:100::1/64 dev ip6gre1 ip netns exec A ip link set ip6gre1 up ip netns exec A ip -6 a a fc:0::1/64 dev eth0 ip netns exec A ip link set ip6gre0 up ip netns exec B ip link set lo up ip netns exec B ip link set eth1 up ip netns exec B ip link add ip6gre1 type ip6gre local fc:0::2 \ remote fc:0::1 ip netns exec B ip -6 a a fc:100::2/64 dev ip6gre1 ip netns exec B ip link set ip6gre1 up ip netns exec B ip -6 a a fc:0::2/64 dev eth1 ip netns exec B ip link set ip6gre0 up ip netns exec A ping fc:100::2 -s 60000 & ip netns del B Splat looks like: [ 73.087285][ C1] BUG: KASAN: use-after-free in ip6gre_tunnel_lookup+0x1064/0x13f0 [ip6_gre] [ 73.088361][ C1] Read of size 4 at addr ffff888040559218 by task ping/1429 [ 73.089317][ C1] [ 73.089638][ C1] CPU: 1 PID: 1429 Comm: ping Not tainted 5.7.0+ #602 [ 73.090531][ C1] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 73.091725][ C1] Call Trace: [ 73.092160][ C1] [ 73.092556][ C1] dump_stack+0x96/0xdb [ 73.093122][ C1] print_address_description.constprop.6+0x2cc/0x450 [ 73.094016][ C1] ? ip6gre_tunnel_lookup+0x1064/0x13f0 [ip6_gre] [ 73.094894][ C1] ? ip6gre_tunnel_lookup+0x1064/0x13f0 [ip6_gre] [ 73.095767][ C1] ? ip6gre_tunnel_lookup+0x1064/0x13f0 [ip6_gre] [ 73.096619][ C1] kasan_report+0x154/0x190 [ 73.097209][ C1] ? ip6gre_tunnel_lookup+0x1064/0x13f0 [ip6_gre] [ 73.097989][ C1] ip6gre_tunnel_lookup+0x1064/0x13f0 [ip6_gre] [ 73.098750][ C1] ? gre_del_protocol+0x60/0x60 [gre] [ 73.099500][ C1] gre_rcv+0x1c5/0x1450 [ip6_gre] [ 73.100199][ C1] ? ip6gre_header+0xf00/0xf00 [ip6_gre] [ 73.100985][ C1] ? rcu_read_lock_sched_held+0xc0/0xc0 [ 73.101830][ C1] ? ip6_input_finish+0x5/0xf0 [ 73.102483][ C1] ip6_protocol_deliver_rcu+0xcbb/0x1510 [ 73.103296][ C1] ip6_input_finish+0x5b/0xf0 [ 73.103920][ C1] ip6_input+0xcd/0x2c0 [ 73.104473][ C1] ? ip6_input_finish+0xf0/0xf0 [ 73.105115][ C1] ? rcu_read_lock_held+0x90/0xa0 [ 73.105783][ C1] ? rcu_read_lock_sched_held+0xc0/0xc0 [ 73.106548][ C1] ipv6_rcv+0x1f1/0x300 [ ... ] Suggested-by: Eric Dumazet Fixes: c12b395a4664 ("gre: Support GRE over IPv6") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 781ca8c07a0d..6532bde82b40 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -127,6 +127,7 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, gre_proto == htons(ETH_P_ERSPAN2)) ? ARPHRD_ETHER : ARPHRD_IP6GRE; int score, cand_score = 4; + struct net_device *ndev; for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) { if (!ipv6_addr_equal(local, &t->parms.laddr) || @@ -238,9 +239,9 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, if (t && t->dev->flags & IFF_UP) return t; - dev = ign->fb_tunnel_dev; - if (dev && dev->flags & IFF_UP) - return netdev_priv(dev); + ndev = READ_ONCE(ign->fb_tunnel_dev); + if (ndev && ndev->flags & IFF_UP) + return netdev_priv(ndev); return NULL; } @@ -413,6 +414,8 @@ static void ip6gre_tunnel_uninit(struct net_device *dev) ip6gre_tunnel_unlink_md(ign, t); ip6gre_tunnel_unlink(ign, t); + if (ign->fb_tunnel_dev == dev) + WRITE_ONCE(ign->fb_tunnel_dev, NULL); dst_cache_reset(&t->dst_cache); dev_put(dev); } -- cgit v1.2.3 From ba61539c6ae57f4146284a5cb4f7b7ed8d42bf45 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 16 Jun 2020 16:51:51 +0000 Subject: ip_tunnel: fix use-after-free in ip_tunnel_lookup() In the datapath, the ip_tunnel_lookup() is used and it internally uses fallback tunnel device pointer, which is fb_tunnel_dev. This pointer variable should be set to NULL when a fb interface is deleted. But there is no routine to set fb_tunnel_dev pointer to NULL. So, this pointer will be still used after interface is deleted and it eventually results in the use-after-free problem. Test commands: ip netns add A ip netns add B ip link add eth0 type veth peer name eth1 ip link set eth0 netns A ip link set eth1 netns B ip netns exec A ip link set lo up ip netns exec A ip link set eth0 up ip netns exec A ip link add gre1 type gre local 10.0.0.1 \ remote 10.0.0.2 ip netns exec A ip link set gre1 up ip netns exec A ip a a 10.0.100.1/24 dev gre1 ip netns exec A ip a a 10.0.0.1/24 dev eth0 ip netns exec B ip link set lo up ip netns exec B ip link set eth1 up ip netns exec B ip link add gre1 type gre local 10.0.0.2 \ remote 10.0.0.1 ip netns exec B ip link set gre1 up ip netns exec B ip a a 10.0.100.2/24 dev gre1 ip netns exec B ip a a 10.0.0.2/24 dev eth1 ip netns exec A hping3 10.0.100.2 -2 --flood -d 60000 & ip netns del B Splat looks like: [ 77.793450][ C3] ================================================================== [ 77.794702][ C3] BUG: KASAN: use-after-free in ip_tunnel_lookup+0xcc4/0xf30 [ 77.795573][ C3] Read of size 4 at addr ffff888060bd9c84 by task hping3/2905 [ 77.796398][ C3] [ 77.796664][ C3] CPU: 3 PID: 2905 Comm: hping3 Not tainted 5.8.0-rc1+ #616 [ 77.797474][ C3] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 77.798453][ C3] Call Trace: [ 77.798815][ C3] [ 77.799142][ C3] dump_stack+0x9d/0xdb [ 77.799605][ C3] print_address_description.constprop.7+0x2cc/0x450 [ 77.800365][ C3] ? ip_tunnel_lookup+0xcc4/0xf30 [ 77.800908][ C3] ? ip_tunnel_lookup+0xcc4/0xf30 [ 77.801517][ C3] ? ip_tunnel_lookup+0xcc4/0xf30 [ 77.802145][ C3] kasan_report+0x154/0x190 [ 77.802821][ C3] ? ip_tunnel_lookup+0xcc4/0xf30 [ 77.803503][ C3] ip_tunnel_lookup+0xcc4/0xf30 [ 77.804165][ C3] __ipgre_rcv+0x1ab/0xaa0 [ip_gre] [ 77.804862][ C3] ? rcu_read_lock_sched_held+0xc0/0xc0 [ 77.805621][ C3] gre_rcv+0x304/0x1910 [ip_gre] [ 77.806293][ C3] ? lock_acquire+0x1a9/0x870 [ 77.806925][ C3] ? gre_rcv+0xfe/0x354 [gre] [ 77.807559][ C3] ? erspan_xmit+0x2e60/0x2e60 [ip_gre] [ 77.808305][ C3] ? rcu_read_lock_sched_held+0xc0/0xc0 [ 77.809032][ C3] ? rcu_read_lock_held+0x90/0xa0 [ 77.809713][ C3] gre_rcv+0x1b8/0x354 [gre] [ ... ] Suggested-by: Eric Dumazet Fixes: c54419321455 ("GRE: Refactor GRE tunneling code.") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index f4f1d11eab50..0c1f36404471 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -85,9 +85,10 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, __be32 remote, __be32 local, __be32 key) { - unsigned int hash; struct ip_tunnel *t, *cand = NULL; struct hlist_head *head; + struct net_device *ndev; + unsigned int hash; hash = ip_tunnel_hash(key, remote); head = &itn->tunnels[hash]; @@ -162,8 +163,9 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, if (t && t->dev->flags & IFF_UP) return t; - if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP) - return netdev_priv(itn->fb_tunnel_dev); + ndev = READ_ONCE(itn->fb_tunnel_dev); + if (ndev && ndev->flags & IFF_UP) + return netdev_priv(ndev); return NULL; } @@ -1259,9 +1261,9 @@ void ip_tunnel_uninit(struct net_device *dev) struct ip_tunnel_net *itn; itn = net_generic(net, tunnel->ip_tnl_net_id); - /* fb_tunnel_dev will be unregisted in net-exit call. */ - if (itn->fb_tunnel_dev != dev) - ip_tunnel_del(itn, netdev_priv(dev)); + ip_tunnel_del(itn, netdev_priv(dev)); + if (itn->fb_tunnel_dev == dev) + WRITE_ONCE(itn->fb_tunnel_dev, NULL); dst_cache_reset(&tunnel->dst_cache); } -- cgit v1.2.3 From 7024339a1cfa0d5bbfa628a7a35b67789cc86d7f Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Tue, 16 Jun 2020 22:25:20 +0200 Subject: net/sched: act_gate: fix NULL dereference in tcf_gate_init() it is possible to see a KASAN use-after-free, immediately followed by a NULL dereference crash, with the following command: # tc action add action gate index 3 cycle-time 100000000ns \ > cycle-time-ext 100000000ns clockid CLOCK_TAI BUG: KASAN: use-after-free in tcf_action_init_1+0x8eb/0x960 Write of size 1 at addr ffff88810a5908bc by task tc/883 CPU: 0 PID: 883 Comm: tc Not tainted 5.7.0+ #188 Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 Call Trace: dump_stack+0x75/0xa0 print_address_description.constprop.6+0x1a/0x220 kasan_report.cold.9+0x37/0x7c tcf_action_init_1+0x8eb/0x960 tcf_action_init+0x157/0x2a0 tcf_action_add+0xd9/0x2f0 tc_ctl_action+0x2a3/0x39d rtnetlink_rcv_msg+0x5f3/0x920 netlink_rcv_skb+0x120/0x380 netlink_unicast+0x439/0x630 netlink_sendmsg+0x714/0xbf0 sock_sendmsg+0xe2/0x110 ____sys_sendmsg+0x5b4/0x890 ___sys_sendmsg+0xe9/0x160 __sys_sendmsg+0xd3/0x170 do_syscall_64+0x9a/0x370 entry_SYSCALL_64_after_hwframe+0x44/0xa9 [...] KASAN: null-ptr-deref in range [0x0000000000000070-0x0000000000000077] CPU: 0 PID: 883 Comm: tc Tainted: G B 5.7.0+ #188 Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 RIP: 0010:tcf_action_fill_size+0xa3/0xf0 [....] RSP: 0018:ffff88813a48f250 EFLAGS: 00010212 RAX: dffffc0000000000 RBX: 0000000000000094 RCX: ffffffffa47c3eb6 RDX: 000000000000000e RSI: 0000000000000008 RDI: 0000000000000070 RBP: ffff88810a590800 R08: 0000000000000004 R09: ffffed1027491e03 R10: 0000000000000003 R11: ffffed1027491e03 R12: 0000000000000000 R13: 0000000000000000 R14: dffffc0000000000 R15: ffff88810a590800 FS: 00007f62cae8ce40(0000) GS:ffff888147c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f62c9d20a10 CR3: 000000013a52a000 CR4: 0000000000340ef0 Call Trace: tcf_action_init+0x172/0x2a0 tcf_action_add+0xd9/0x2f0 tc_ctl_action+0x2a3/0x39d rtnetlink_rcv_msg+0x5f3/0x920 netlink_rcv_skb+0x120/0x380 netlink_unicast+0x439/0x630 netlink_sendmsg+0x714/0xbf0 sock_sendmsg+0xe2/0x110 ____sys_sendmsg+0x5b4/0x890 ___sys_sendmsg+0xe9/0x160 __sys_sendmsg+0xd3/0x170 do_syscall_64+0x9a/0x370 entry_SYSCALL_64_after_hwframe+0x44/0xa9 this is caused by the test on 'cycletime_ext', that is still unassigned when the action is newly created. This makes the action .init() return 0 without calling tcf_idr_insert(), hence the UAF + crash. rework the logic that prevents zero values of cycle-time, as follows: 1) 'tcfg_cycletime_ext' seems to be unused in the action software path, and it was already possible by other means to obtain non-zero cycletime and zero cycletime-ext. So, removing that test should not cause any damage. 2) while at it, we must prevent overwriting configuration data with wrong ones: use a temporary variable for 'tcfg_cycletime', and validate it preserving the original semantic (that allowed computing the cycle time as the sum of all intervals, when not specified by TCA_GATE_CYCLE_TIME). 3) remove the test on 'tcfg_cycletime', no more useful, and avoid returning -EFAULT, which did not seem an appropriate return value for a wrong netlink attribute. v3: fix uninitialized 'cycletime' (thanks to Vladimir Oltean) v2: remove useless 'return;' at the end of void gate_get_start_time() Fixes: a51c328df310 ("net: qos: introduce a gate control flow action") CC: Ivan Vecera Signed-off-by: Davide Caratti Acked-by: Vladimir Oltean Tested-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/sched/act_gate.c | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index 9c628591f452..94e560c2f81d 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -32,7 +32,7 @@ static ktime_t gate_get_time(struct tcf_gate *gact) return KTIME_MAX; } -static int gate_get_start_time(struct tcf_gate *gact, ktime_t *start) +static void gate_get_start_time(struct tcf_gate *gact, ktime_t *start) { struct tcf_gate_params *param = &gact->param; ktime_t now, base, cycle; @@ -43,18 +43,13 @@ static int gate_get_start_time(struct tcf_gate *gact, ktime_t *start) if (ktime_after(base, now)) { *start = base; - return 0; + return; } cycle = param->tcfg_cycletime; - /* cycle time should not be zero */ - if (!cycle) - return -EFAULT; - n = div64_u64(ktime_sub_ns(now, base), cycle); *start = ktime_add_ns(base, (n + 1) * cycle); - return 0; } static void gate_start_timer(struct tcf_gate *gact, ktime_t start) @@ -287,12 +282,12 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, enum tk_offsets tk_offset = TK_OFFS_TAI; struct nlattr *tb[TCA_GATE_MAX + 1]; struct tcf_chain *goto_ch = NULL; + u64 cycletime = 0, basetime = 0; struct tcf_gate_params *p; s32 clockid = CLOCK_TAI; struct tcf_gate *gact; struct tc_gate *parm; int ret = 0, err; - u64 basetime = 0; u32 gflags = 0; s32 prio = -1; ktime_t start; @@ -375,11 +370,8 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, spin_lock_bh(&gact->tcf_lock); p = &gact->param; - if (tb[TCA_GATE_CYCLE_TIME]) { - p->tcfg_cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]); - if (!p->tcfg_cycletime_ext) - goto chain_put; - } + if (tb[TCA_GATE_CYCLE_TIME]) + cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]); if (tb[TCA_GATE_ENTRY_LIST]) { err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack); @@ -387,14 +379,19 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, goto chain_put; } - if (!p->tcfg_cycletime) { + if (!cycletime) { struct tcfg_gate_entry *entry; ktime_t cycle = 0; list_for_each_entry(entry, &p->entries, list) cycle = ktime_add_ns(cycle, entry->interval); - p->tcfg_cycletime = cycle; + cycletime = cycle; + if (!cycletime) { + err = -EINVAL; + goto chain_put; + } } + p->tcfg_cycletime = cycletime; if (tb[TCA_GATE_CYCLE_TIME_EXT]) p->tcfg_cycletime_ext = @@ -408,14 +405,7 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, gact->tk_offset = tk_offset; hrtimer_init(&gact->hitimer, clockid, HRTIMER_MODE_ABS_SOFT); gact->hitimer.function = gate_timer_func; - - err = gate_get_start_time(gact, &start); - if (err < 0) { - NL_SET_ERR_MSG(extack, - "Internal error: failed get start time"); - release_entry_list(&p->entries); - goto chain_put; - } + gate_get_start_time(gact, &start); gact->current_close_time = start; gact->current_gate_status = GATE_ACT_GATE_OPEN | GATE_ACT_PENDING; -- cgit v1.2.3 From c362a06e96eae16ccb7b0f4f93f19224279b8610 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Tue, 16 Jun 2020 22:25:21 +0200 Subject: net/sched: act_gate: fix configuration of the periodic timer assigning a dummy value of 'clock_id' to avoid cancellation of the cycle timer before its initialization was a temporary solution, and we still need to handle the case where act_gate timer parameters are changed by commands like the following one: # tc action replace action gate the fix consists in the following items: 1) remove the workaround assignment of 'clock_id', and init the list of entries before the first error path after IDR atomic check/allocation 2) validate 'clock_id' earlier: there is no need to do IDR atomic check/allocation if we know that 'clock_id' is a bad value 3) use a dedicated function, 'gate_setup_timer()', to ensure that the timer is cancelled and re-initialized on action overwrite, and also ensure we initialize the timer in the error path of tcf_gate_init() v3: improve comment in the error path of tcf_gate_init() (thanks to Vladimir Oltean) v2: avoid 'goto' in gate_setup_timer (thanks to Cong Wang) CC: Ivan Vecera Fixes: a01c245438c5 ("net/sched: fix a couple of splats in the error path of tfc_gate_init()") Fixes: a51c328df310 ("net: qos: introduce a gate control flow action") Signed-off-by: Davide Caratti Acked-by: Vladimir Oltean Tested-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/sched/act_gate.c | 90 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 55 insertions(+), 35 deletions(-) diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index 94e560c2f81d..323ae7f6315d 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -272,6 +272,27 @@ release_list: return err; } +static void gate_setup_timer(struct tcf_gate *gact, u64 basetime, + enum tk_offsets tko, s32 clockid, + bool do_init) +{ + if (!do_init) { + if (basetime == gact->param.tcfg_basetime && + tko == gact->tk_offset && + clockid == gact->param.tcfg_clockid) + return; + + spin_unlock_bh(&gact->tcf_lock); + hrtimer_cancel(&gact->hitimer); + spin_lock_bh(&gact->tcf_lock); + } + gact->param.tcfg_basetime = basetime; + gact->param.tcfg_clockid = clockid; + gact->tk_offset = tko; + hrtimer_init(&gact->hitimer, clockid, HRTIMER_MODE_ABS_SOFT); + gact->hitimer.function = gate_timer_func; +} + static int tcf_gate_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, @@ -303,6 +324,27 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, if (!tb[TCA_GATE_PARMS]) return -EINVAL; + if (tb[TCA_GATE_CLOCKID]) { + clockid = nla_get_s32(tb[TCA_GATE_CLOCKID]); + switch (clockid) { + case CLOCK_REALTIME: + tk_offset = TK_OFFS_REAL; + break; + case CLOCK_MONOTONIC: + tk_offset = TK_OFFS_MAX; + break; + case CLOCK_BOOTTIME: + tk_offset = TK_OFFS_BOOT; + break; + case CLOCK_TAI: + tk_offset = TK_OFFS_TAI; + break; + default: + NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); + return -EINVAL; + } + } + parm = nla_data(tb[TCA_GATE_PARMS]); index = parm->index; @@ -326,10 +368,6 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, tcf_idr_release(*a, bind); return -EEXIST; } - if (ret == ACT_P_CREATED) { - to_gate(*a)->param.tcfg_clockid = -1; - INIT_LIST_HEAD(&(to_gate(*a)->param.entries)); - } if (tb[TCA_GATE_PRIORITY]) prio = nla_get_s32(tb[TCA_GATE_PRIORITY]); @@ -340,33 +378,14 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, if (tb[TCA_GATE_FLAGS]) gflags = nla_get_u32(tb[TCA_GATE_FLAGS]); - if (tb[TCA_GATE_CLOCKID]) { - clockid = nla_get_s32(tb[TCA_GATE_CLOCKID]); - switch (clockid) { - case CLOCK_REALTIME: - tk_offset = TK_OFFS_REAL; - break; - case CLOCK_MONOTONIC: - tk_offset = TK_OFFS_MAX; - break; - case CLOCK_BOOTTIME: - tk_offset = TK_OFFS_BOOT; - break; - case CLOCK_TAI: - tk_offset = TK_OFFS_TAI; - break; - default: - NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); - goto release_idr; - } - } + gact = to_gate(*a); + if (ret == ACT_P_CREATED) + INIT_LIST_HEAD(&gact->param.entries); err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; - gact = to_gate(*a); - spin_lock_bh(&gact->tcf_lock); p = &gact->param; @@ -397,14 +416,10 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, p->tcfg_cycletime_ext = nla_get_u64(tb[TCA_GATE_CYCLE_TIME_EXT]); + gate_setup_timer(gact, basetime, tk_offset, clockid, + ret == ACT_P_CREATED); p->tcfg_priority = prio; - p->tcfg_basetime = basetime; - p->tcfg_clockid = clockid; p->tcfg_flags = gflags; - - gact->tk_offset = tk_offset; - hrtimer_init(&gact->hitimer, clockid, HRTIMER_MODE_ABS_SOFT); - gact->hitimer.function = gate_timer_func; gate_get_start_time(gact, &start); gact->current_close_time = start; @@ -433,6 +448,13 @@ chain_put: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: + /* action is not inserted in any list: it's safe to init hitimer + * without taking tcf_lock. + */ + if (ret == ACT_P_CREATED) + gate_setup_timer(gact, gact->param.tcfg_basetime, + gact->tk_offset, gact->param.tcfg_clockid, + true); tcf_idr_release(*a, bind); return err; } @@ -443,9 +465,7 @@ static void tcf_gate_cleanup(struct tc_action *a) struct tcf_gate_params *p; p = &gact->param; - if (p->tcfg_clockid != -1) - hrtimer_cancel(&gact->hitimer); - + hrtimer_cancel(&gact->hitimer); release_entry_list(&p->entries); } -- cgit v1.2.3 From 5b3b396c77677b51a6bd1aacd0fe2a04d1efa0ff Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 17 Jun 2020 02:58:41 +0300 Subject: net: dsa: sja1105: remove debugging code in sja1105_vl_gate This shouldn't be there. Fixes: 834f8933d5dd ("net: dsa: sja1105: implement tc-gate using time-triggered virtual links") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_vl.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_vl.c b/drivers/net/dsa/sja1105/sja1105_vl.c index bdfd6c4e190d..32eca3e660e1 100644 --- a/drivers/net/dsa/sja1105/sja1105_vl.c +++ b/drivers/net/dsa/sja1105/sja1105_vl.c @@ -588,14 +588,10 @@ int sja1105_vl_gate(struct sja1105_private *priv, int port, if (priv->vlan_state == SJA1105_VLAN_UNAWARE && key->type != SJA1105_KEY_VLAN_UNAWARE_VL) { - dev_err(priv->ds->dev, "1: vlan state %d key type %d\n", - priv->vlan_state, key->type); NL_SET_ERR_MSG_MOD(extack, "Can only gate based on DMAC"); return -EOPNOTSUPP; } else if (key->type != SJA1105_KEY_VLAN_AWARE_VL) { - dev_err(priv->ds->dev, "2: vlan state %d key type %d\n", - priv->vlan_state, key->type); NL_SET_ERR_MSG_MOD(extack, "Can only gate based on {DMAC, VID, PCP}"); return -EOPNOTSUPP; -- cgit v1.2.3 From c6ae970bcc8e6f0b761111680ba78f7be43fb05a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 17 Jun 2020 02:58:42 +0300 Subject: net: dsa: sja1105: fix checks for VLAN state in redirect action This action requires the VLAN awareness state of the switch to be of the same type as the key that's being added: - If the switch is unaware of VLAN, then the tc filter key must only contain the destination MAC address. - If the switch is VLAN-aware, the key must also contain the VLAN ID and PCP. But this check doesn't work unless we verify the VLAN awareness state on both the "if" and the "else" branches. Fixes: dfacc5a23e22 ("net: dsa: sja1105: support flow-based redirection via virtual links") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_vl.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/sja1105/sja1105_vl.c b/drivers/net/dsa/sja1105/sja1105_vl.c index 32eca3e660e1..a176f39a052b 100644 --- a/drivers/net/dsa/sja1105/sja1105_vl.c +++ b/drivers/net/dsa/sja1105/sja1105_vl.c @@ -342,7 +342,9 @@ int sja1105_vl_redirect(struct sja1105_private *priv, int port, NL_SET_ERR_MSG_MOD(extack, "Can only redirect based on DMAC"); return -EOPNOTSUPP; - } else if (key->type != SJA1105_KEY_VLAN_AWARE_VL) { + } else if ((priv->vlan_state == SJA1105_VLAN_BEST_EFFORT || + priv->vlan_state == SJA1105_VLAN_FILTERING_FULL) && + key->type != SJA1105_KEY_VLAN_AWARE_VL) { NL_SET_ERR_MSG_MOD(extack, "Can only redirect based on {DMAC, VID, PCP}"); return -EOPNOTSUPP; -- cgit v1.2.3 From 5182a6222dd0f27c41cda2706bdc58d48a0d8f96 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 17 Jun 2020 02:58:43 +0300 Subject: net: dsa: sja1105: fix checks for VLAN state in gate action This action requires the VLAN awareness state of the switch to be of the same type as the key that's being added: - If the switch is unaware of VLAN, then the tc filter key must only contain the destination MAC address. - If the switch is VLAN-aware, the key must also contain the VLAN ID and PCP. But this check doesn't work unless we verify the VLAN awareness state on both the "if" and the "else" branches. Fixes: 834f8933d5dd ("net: dsa: sja1105: implement tc-gate using time-triggered virtual links") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_vl.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/sja1105/sja1105_vl.c b/drivers/net/dsa/sja1105/sja1105_vl.c index a176f39a052b..0056f9c1e471 100644 --- a/drivers/net/dsa/sja1105/sja1105_vl.c +++ b/drivers/net/dsa/sja1105/sja1105_vl.c @@ -593,7 +593,9 @@ int sja1105_vl_gate(struct sja1105_private *priv, int port, NL_SET_ERR_MSG_MOD(extack, "Can only gate based on DMAC"); return -EOPNOTSUPP; - } else if (key->type != SJA1105_KEY_VLAN_AWARE_VL) { + } else if ((priv->vlan_state == SJA1105_VLAN_BEST_EFFORT || + priv->vlan_state == SJA1105_VLAN_FILTERING_FULL) && + key->type != SJA1105_KEY_VLAN_AWARE_VL) { NL_SET_ERR_MSG_MOD(extack, "Can only gate based on {DMAC, VID, PCP}"); return -EOPNOTSUPP; -- cgit v1.2.3 From 5eea3a63ff4aba6a26002e657a6d21934b7e2b96 Mon Sep 17 00:00:00 2001 From: guodeqing Date: Wed, 17 Jun 2020 10:07:16 +0800 Subject: net: Fix the arp error in some cases ie., $ ifconfig eth0 6.6.6.6 netmask 255.255.255.0 $ ip rule add from 6.6.6.6 table 6666 $ ip route add 9.9.9.9 via 6.6.6.6 $ ping -I 6.6.6.6 9.9.9.9 PING 9.9.9.9 (9.9.9.9) from 6.6.6.6 : 56(84) bytes of data. 3 packets transmitted, 0 received, 100% packet loss, time 2079ms $ arp Address HWtype HWaddress Flags Mask Iface 6.6.6.6 (incomplete) eth0 The arp request address is error, this is because fib_table_lookup in fib_check_nh lookup the destnation 9.9.9.9 nexthop, the scope of the fib result is RT_SCOPE_LINK,the correct scope is RT_SCOPE_HOST. Here I add a check of whether this is RT_TABLE_MAIN to solve this problem. Fixes: 3bfd847203c6 ("net: Use passed in table for nexthop lookups") Signed-off-by: guodeqing Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e53871e4a097..1f75dc686b6b 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1109,7 +1109,7 @@ static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table, if (fl4.flowi4_scope < RT_SCOPE_LINK) fl4.flowi4_scope = RT_SCOPE_LINK; - if (table) + if (table && table != RT_TABLE_MAIN) tbl = fib_get_table(net, table); if (tbl) -- cgit v1.2.3 From 8fd4de1275580a1befa1456d1070eaf6489fb48f Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 17 Jun 2020 12:08:56 +0200 Subject: mptcp: cache msk on MP_JOIN init_req The msk ownership is transferred to the child socket at 3rd ack time, so that we avoid more lookups later. If the request does not reach the 3rd ack, the MSK reference is dropped at request sock release time. As a side effect, fallback is now tracked by a NULL msk reference instead of zeroed 'mp_join' field. This will simplify the next patch. Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.h | 1 + net/mptcp/subflow.c | 39 +++++++++++++++++---------------------- 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index db56535dfc29..c6eeaf3e8dcb 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -249,6 +249,7 @@ struct mptcp_subflow_request_sock { u64 thmac; u32 local_nonce; u32 remote_nonce; + struct mptcp_sock *msk; }; static inline struct mptcp_subflow_request_sock * diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index bbdb74b8bc3c..4068bdb2523b 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -69,6 +69,9 @@ static void subflow_req_destructor(struct request_sock *req) pr_debug("subflow_req=%p", subflow_req); + if (subflow_req->msk) + sock_put((struct sock *)subflow_req->msk); + if (subflow_req->mp_capable) mptcp_token_destroy_request(subflow_req->token); tcp_request_sock_ops.destructor(req); @@ -86,8 +89,8 @@ static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2, } /* validate received token and create truncated hmac and nonce for SYN-ACK */ -static bool subflow_token_join_request(struct request_sock *req, - const struct sk_buff *skb) +static struct mptcp_sock *subflow_token_join_request(struct request_sock *req, + const struct sk_buff *skb) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); u8 hmac[SHA256_DIGEST_SIZE]; @@ -97,13 +100,13 @@ static bool subflow_token_join_request(struct request_sock *req, msk = mptcp_token_get_sock(subflow_req->token); if (!msk) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN); - return false; + return NULL; } local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req); if (local_id < 0) { sock_put((struct sock *)msk); - return false; + return NULL; } subflow_req->local_id = local_id; @@ -114,9 +117,7 @@ static bool subflow_token_join_request(struct request_sock *req, subflow_req->remote_nonce, hmac); subflow_req->thmac = get_unaligned_be64(hmac); - - sock_put((struct sock *)msk); - return true; + return msk; } static void subflow_init_req(struct request_sock *req, @@ -133,6 +134,7 @@ static void subflow_init_req(struct request_sock *req, subflow_req->mp_capable = 0; subflow_req->mp_join = 0; + subflow_req->msk = NULL; #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of @@ -166,12 +168,9 @@ static void subflow_init_req(struct request_sock *req, subflow_req->remote_id = mp_opt.join_id; subflow_req->token = mp_opt.token; subflow_req->remote_nonce = mp_opt.nonce; - pr_debug("token=%u, remote_nonce=%u", subflow_req->token, - subflow_req->remote_nonce); - if (!subflow_token_join_request(req, skb)) { - subflow_req->mp_join = 0; - // @@ need to trigger RST - } + subflow_req->msk = subflow_token_join_request(req, skb); + pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token, + subflow_req->remote_nonce, subflow_req->msk); } } @@ -354,10 +353,9 @@ static bool subflow_hmac_valid(const struct request_sock *req, const struct mptcp_subflow_request_sock *subflow_req; u8 hmac[SHA256_DIGEST_SIZE]; struct mptcp_sock *msk; - bool ret; subflow_req = mptcp_subflow_rsk(req); - msk = mptcp_token_get_sock(subflow_req->token); + msk = subflow_req->msk; if (!msk) return false; @@ -365,12 +363,7 @@ static bool subflow_hmac_valid(const struct request_sock *req, subflow_req->remote_nonce, subflow_req->local_nonce, hmac); - ret = true; - if (crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN)) - ret = false; - - sock_put((struct sock *)msk); - return ret; + return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN); } static void mptcp_sock_destruct(struct sock *sk) @@ -522,10 +515,12 @@ create_child: } else if (ctx->mp_join) { struct mptcp_sock *owner; - owner = mptcp_token_get_sock(ctx->token); + owner = subflow_req->msk; if (!owner) goto dispose_child; + /* move the msk reference ownership to the subflow */ + subflow_req->msk = NULL; ctx->conn = (struct sock *)owner; if (!mptcp_finish_join(child)) goto dispose_child; -- cgit v1.2.3 From 9e365ff576b7c1623bbc5ef31ec652c533e2f65e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 17 Jun 2020 12:08:57 +0200 Subject: mptcp: drop MP_JOIN request sock on syn cookies Currently any MPTCP socket using syn cookies will fallback to TCP at 3rd ack time. In case of MP_JOIN requests, the RFC mandate closing the child and sockets, but the existing error paths do not handle the syncookie scenario correctly. Address the issue always forcing the child shutdown in case of MP_JOIN fallback. Fixes: ae2dd7164943 ("mptcp: handle tcp fallback when using syn cookies") Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/subflow.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 4068bdb2523b..3838a0b3a21f 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -431,22 +431,25 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk); struct mptcp_subflow_request_sock *subflow_req; struct mptcp_options_received mp_opt; - bool fallback_is_fatal = false; + bool fallback, fallback_is_fatal; struct sock *new_msk = NULL; - bool fallback = false; struct sock *child; pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn); - /* we need later a valid 'mp_capable' value even when options are not - * parsed + /* After child creation we must look for 'mp_capable' even when options + * are not parsed */ mp_opt.mp_capable = 0; - if (tcp_rsk(req)->is_mptcp == 0) + + /* hopefully temporary handling for MP_JOIN+syncookie */ + subflow_req = mptcp_subflow_rsk(req); + fallback_is_fatal = subflow_req->mp_join; + fallback = !tcp_rsk(req)->is_mptcp; + if (fallback) goto create_child; /* if the sk is MP_CAPABLE, we try to fetch the client key */ - subflow_req = mptcp_subflow_rsk(req); if (subflow_req->mp_capable) { if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) { /* here we can receive and accept an in-window, @@ -467,12 +470,11 @@ create_msk: if (!new_msk) fallback = true; } else if (subflow_req->mp_join) { - fallback_is_fatal = true; mptcp_get_options(skb, &mp_opt); if (!mp_opt.mp_join || !subflow_hmac_valid(req, &mp_opt)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); - return NULL; + fallback = true; } } -- cgit v1.2.3 From e2dfcfba00ba4a414617ef4c5a8501fe21567eb3 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Wed, 17 Jun 2020 16:54:52 +0200 Subject: s390/qeth: fix error handling for isolation mode cmds Current(?) OSA devices also store their cmd-specific return codes for SET_ACCESS_CONTROL cmds into the top-level cmd->hdr.return_code. So once we added stricter checking for the top-level field a while ago, none of the error logic that rolls back the user's configuration to its old state is applied any longer. For this specific cmd, go back to the old model where we peek into the cmd structure even though the top-level field indicated an error. Fixes: 686c97ee29c8 ("s390/qeth: fix error handling in adapter command callbacks") Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core_main.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 18a0fb75a710..9636415f810b 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -4544,9 +4544,6 @@ static int qeth_setadpparms_set_access_ctrl_cb(struct qeth_card *card, int fallback = *(int *)reply->param; QETH_CARD_TEXT(card, 4, "setaccb"); - if (cmd->hdr.return_code) - return -EIO; - qeth_setadpparms_inspect_rc(cmd); access_ctrl_req = &cmd->data.setadapterparms.data.set_access_ctrl; QETH_CARD_TEXT_(card, 2, "rc=%d", @@ -4556,7 +4553,7 @@ static int qeth_setadpparms_set_access_ctrl_cb(struct qeth_card *card, QETH_DBF_MESSAGE(3, "ERR:SET_ACCESS_CTRL(%#x) on device %x: %#x\n", access_ctrl_req->subcmd_code, CARD_DEVID(card), cmd->data.setadapterparms.hdr.return_code); - switch (cmd->data.setadapterparms.hdr.return_code) { + switch (qeth_setadpparms_inspect_rc(cmd)) { case SET_ACCESS_CTRL_RC_SUCCESS: if (card->options.isolation == ISOLATION_MODE_NONE) { dev_info(&card->gdev->dev, -- cgit v1.2.3 From 8cebedb643832586162aa5313cae781173c1f81b Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Wed, 17 Jun 2020 16:54:53 +0200 Subject: s390/qeth: let isolation mode override HW offload restrictions When a device is configured with ISOLATION_MODE_FWD, traffic never goes through the internal switch. Don't apply the offload restrictions in this case. Fixes: c619e9a6f52f ("s390/qeth: don't use restricted offloads for local traffic") Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core_main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 9636415f810b..88e998de2d03 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -6837,9 +6837,11 @@ netdev_features_t qeth_features_check(struct sk_buff *skb, struct net_device *dev, netdev_features_t features) { + struct qeth_card *card = dev->ml_priv; + /* Traffic with local next-hop is not eligible for some offloads: */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { - struct qeth_card *card = dev->ml_priv; + if (skb->ip_summed == CHECKSUM_PARTIAL && + card->options.isolation != ISOLATION_MODE_FWD) { netdev_features_t restricted = 0; if (skb_is_gso(skb) && !netif_needs_gso(skb, features)) -- cgit v1.2.3 From 4c98045c9b74feab837be58986c0517d3cc661f1 Mon Sep 17 00:00:00 2001 From: Martin Date: Wed, 17 Jun 2020 22:30:23 +0530 Subject: bareudp: Fixed multiproto mode configuration Code to handle multiproto configuration is missing. Fixes: 4b5f67232d95 ("net: Special handling for IP & MPLS") Signed-off-by: Martin Signed-off-by: David S. Miller --- drivers/net/bareudp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index 5d3c691a1c66..3dd46cd55114 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -572,6 +572,9 @@ static int bareudp2info(struct nlattr *data[], struct bareudp_conf *conf, if (data[IFLA_BAREUDP_SRCPORT_MIN]) conf->sport_min = nla_get_u16(data[IFLA_BAREUDP_SRCPORT_MIN]); + if (data[IFLA_BAREUDP_MULTIPROTO_MODE]) + conf->multi_proto_mode = true; + return 0; } -- cgit v1.2.3 From 3a2656a211caf35e56afc9425e6e518fa52f7fbc Mon Sep 17 00:00:00 2001 From: David Christensen Date: Wed, 17 Jun 2020 11:51:17 -0700 Subject: tg3: driver sleeps indefinitely when EEH errors exceed eeh_max_freezes The driver function tg3_io_error_detected() calls napi_disable twice, without an intervening napi_enable, when the number of EEH errors exceeds eeh_max_freezes, resulting in an indefinite sleep while holding rtnl_lock. Add check for pcierr_recovery which skips code already executed for the "Frozen" state. Signed-off-by: David Christensen Reviewed-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/tg3.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 7a3b22b35238..ebff1fc0d8ce 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -18168,8 +18168,8 @@ static pci_ers_result_t tg3_io_error_detected(struct pci_dev *pdev, rtnl_lock(); - /* We probably don't have netdev yet */ - if (!netdev || !netif_running(netdev)) + /* Could be second call or maybe we don't have netdev yet */ + if (!netdev || tp->pcierr_recovery || !netif_running(netdev)) goto done; /* We needn't recover from permanent error */ -- cgit v1.2.3 From eddbf5d0204e550ee59de02bdc19fe90d4203dd6 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 17 Jun 2020 20:42:47 +0000 Subject: net: ethtool: add missing NETIF_F_GSO_FRAGLIST feature string Commit 3b33583265ed ("net: Add fraglist GRO/GSO feature flags") missed an entry for NETIF_F_GSO_FRAGLIST in netdev_features_strings array. As a result, fraglist GSO feature is not shown in 'ethtool -k' output and can't be toggled on/off. The fix is trivial. Fixes: 3b33583265ed ("net: Add fraglist GRO/GSO feature flags") Signed-off-by: Alexander Lobakin Reviewed-by: Michal Kubecek Signed-off-by: David S. Miller --- net/ethtool/common.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 423e640e3876..47f63526818e 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -43,6 +43,7 @@ const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation", [NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation", [NETIF_F_GSO_UDP_L4_BIT] = "tx-udp-segmentation", + [NETIF_F_GSO_FRAGLIST_BIT] = "tx-gso-list", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp", -- cgit v1.2.3 From 8dbe4c5d5e40fe140221024f7b16bec9f310bf70 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 17 Jun 2020 20:42:44 -0700 Subject: net: dsa: bcm_sf2: Fix node reference count of_find_node_by_name() will do an of_node_put() on the "from" argument. With CONFIG_OF_DYNAMIC enabled which checks for device_node reference counts, we would be getting a warning like this: [ 6.347230] refcount_t: increment on 0; use-after-free. [ 6.352498] WARNING: CPU: 3 PID: 77 at lib/refcount.c:156 refcount_inc_checked+0x38/0x44 [ 6.360601] Modules linked in: [ 6.363661] CPU: 3 PID: 77 Comm: kworker/3:1 Tainted: G W 5.4.46-gb78b3e9956e6 #13 [ 6.372546] Hardware name: BCM97278SV (DT) [ 6.376649] Workqueue: events deferred_probe_work_func [ 6.381796] pstate: 60000005 (nZCv daif -PAN -UAO) [ 6.386595] pc : refcount_inc_checked+0x38/0x44 [ 6.391133] lr : refcount_inc_checked+0x38/0x44 ... [ 6.478791] Call trace: [ 6.481243] refcount_inc_checked+0x38/0x44 [ 6.485433] kobject_get+0x3c/0x4c [ 6.488840] of_node_get+0x24/0x34 [ 6.492247] of_irq_find_parent+0x3c/0xe0 [ 6.496263] of_irq_parse_one+0xe4/0x1d0 [ 6.500191] irq_of_parse_and_map+0x44/0x84 [ 6.504381] bcm_sf2_sw_probe+0x22c/0x844 [ 6.508397] platform_drv_probe+0x58/0xa8 [ 6.512413] really_probe+0x238/0x3fc [ 6.516081] driver_probe_device+0x11c/0x12c [ 6.520358] __device_attach_driver+0xa8/0x100 [ 6.524808] bus_for_each_drv+0xb4/0xd0 [ 6.528650] __device_attach+0xd0/0x164 [ 6.532493] device_initial_probe+0x24/0x30 [ 6.536682] bus_probe_device+0x38/0x98 [ 6.540524] deferred_probe_work_func+0xa8/0xd4 [ 6.545061] process_one_work+0x178/0x288 [ 6.549078] process_scheduled_works+0x44/0x48 [ 6.553529] worker_thread+0x218/0x270 [ 6.557285] kthread+0xdc/0xe4 [ 6.560344] ret_from_fork+0x10/0x18 [ 6.563925] ---[ end trace 68f65caf69bb152a ]--- Fix this by adding a of_node_get() to increment the reference count prior to the call. Fixes: afa3b592953b ("net: dsa: bcm_sf2: Ensure correct sub-node is parsed") Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/bcm_sf2.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index c7ac63f41918..946e41f020a5 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -1147,6 +1147,8 @@ static int bcm_sf2_sw_probe(struct platform_device *pdev) set_bit(0, priv->cfp.used); set_bit(0, priv->cfp.unique); + /* Balance of_node_put() done by of_find_node_by_name() */ + of_node_get(dn); ports = of_find_node_by_name(dn, "ports"); if (ports) { bcm_sf2_identify_ports(priv, ports); -- cgit v1.2.3 From 0ad6f6e767ec2f613418cbc7ebe5ec4c35af540c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 17 Jun 2020 22:23:25 -0700 Subject: net: increment xmit_recursion level in dev_direct_xmit() Back in commit f60e5990d9c1 ("ipv6: protect skb->sk accesses from recursive dereference inside the stack") Hannes added code so that IPv6 stack would not trust skb->sk for typical cases where packet goes through 'standard' xmit path (__dev_queue_xmit()) Alas af_packet had a dev_direct_xmit() path that was not dealing yet with xmit_recursion level. Also change sk_mc_loop() to dump a stack once only. Without this patch, syzbot was able to trigger : [1] [ 153.567378] WARNING: CPU: 7 PID: 11273 at net/core/sock.c:721 sk_mc_loop+0x51/0x70 [ 153.567378] Modules linked in: nfnetlink ip6table_raw ip6table_filter iptable_raw iptable_nat nf_nat nf_conntrack nf_defrag_ipv4 nf_defrag_ipv6 iptable_filter macsec macvtap tap macvlan 8021q hsr wireguard libblake2s blake2s_x86_64 libblake2s_generic udp_tunnel ip6_udp_tunnel libchacha20poly1305 poly1305_x86_64 chacha_x86_64 libchacha curve25519_x86_64 libcurve25519_generic netdevsim batman_adv dummy team bridge stp llc w1_therm wire i2c_mux_pca954x i2c_mux cdc_acm ehci_pci ehci_hcd mlx4_en mlx4_ib ib_uverbs ib_core mlx4_core [ 153.567386] CPU: 7 PID: 11273 Comm: b159172088 Not tainted 5.8.0-smp-DEV #273 [ 153.567387] RIP: 0010:sk_mc_loop+0x51/0x70 [ 153.567388] Code: 66 83 f8 0a 75 24 0f b6 4f 12 b8 01 00 00 00 31 d2 d3 e0 a9 bf ef ff ff 74 07 48 8b 97 f0 02 00 00 0f b6 42 3a 83 e0 01 5d c3 <0f> 0b b8 01 00 00 00 5d c3 0f b6 87 18 03 00 00 5d c0 e8 04 83 e0 [ 153.567388] RSP: 0018:ffff95c69bb93990 EFLAGS: 00010212 [ 153.567388] RAX: 0000000000000011 RBX: ffff95c6e0ee3e00 RCX: 0000000000000007 [ 153.567389] RDX: ffff95c69ae50000 RSI: ffff95c6c30c3000 RDI: ffff95c6c30c3000 [ 153.567389] RBP: ffff95c69bb93990 R08: ffff95c69a77f000 R09: 0000000000000008 [ 153.567389] R10: 0000000000000040 R11: 00003e0e00026128 R12: ffff95c6c30c3000 [ 153.567390] R13: ffff95c6cc4fd500 R14: ffff95c6f84500c0 R15: ffff95c69aa13c00 [ 153.567390] FS: 00007fdc3a283700(0000) GS:ffff95c6ff9c0000(0000) knlGS:0000000000000000 [ 153.567390] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 153.567391] CR2: 00007ffee758e890 CR3: 0000001f9ba20003 CR4: 00000000001606e0 [ 153.567391] Call Trace: [ 153.567391] ip6_finish_output2+0x34e/0x550 [ 153.567391] __ip6_finish_output+0xe7/0x110 [ 153.567391] ip6_finish_output+0x2d/0xb0 [ 153.567392] ip6_output+0x77/0x120 [ 153.567392] ? __ip6_finish_output+0x110/0x110 [ 153.567392] ip6_local_out+0x3d/0x50 [ 153.567392] ipvlan_queue_xmit+0x56c/0x5e0 [ 153.567393] ? ksize+0x19/0x30 [ 153.567393] ipvlan_start_xmit+0x18/0x50 [ 153.567393] dev_direct_xmit+0xf3/0x1c0 [ 153.567393] packet_direct_xmit+0x69/0xa0 [ 153.567394] packet_sendmsg+0xbf0/0x19b0 [ 153.567394] ? plist_del+0x62/0xb0 [ 153.567394] sock_sendmsg+0x65/0x70 [ 153.567394] sock_write_iter+0x93/0xf0 [ 153.567394] new_sync_write+0x18e/0x1a0 [ 153.567395] __vfs_write+0x29/0x40 [ 153.567395] vfs_write+0xb9/0x1b0 [ 153.567395] ksys_write+0xb1/0xe0 [ 153.567395] __x64_sys_write+0x1a/0x20 [ 153.567395] do_syscall_64+0x43/0x70 [ 153.567396] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 153.567396] RIP: 0033:0x453549 [ 153.567396] Code: Bad RIP value. [ 153.567396] RSP: 002b:00007fdc3a282cc8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 153.567397] RAX: ffffffffffffffda RBX: 00000000004d32d0 RCX: 0000000000453549 [ 153.567397] RDX: 0000000000000020 RSI: 0000000020000300 RDI: 0000000000000003 [ 153.567398] RBP: 00000000004d32d8 R08: 0000000000000000 R09: 0000000000000000 [ 153.567398] R10: 0000000000000000 R11: 0000000000000246 R12: 00000000004d32dc [ 153.567398] R13: 00007ffee742260f R14: 00007fdc3a282dc0 R15: 00007fdc3a283700 [ 153.567399] ---[ end trace c1d5ae2b1059ec62 ]--- f60e5990d9c1 ("ipv6: protect skb->sk accesses from recursive dereference inside the stack") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/core/dev.c | 2 ++ net/core/sock.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 44a14b41ad82..90b59fc50dc9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4192,10 +4192,12 @@ int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) local_bh_disable(); + dev_xmit_recursion_inc(); HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_drv_stopped(txq)) ret = netdev_start_xmit(skb, dev, txq, false); HARD_TX_UNLOCK(dev, txq); + dev_xmit_recursion_dec(); local_bh_enable(); diff --git a/net/core/sock.c b/net/core/sock.c index 6c4acf1f0220..94391da27754 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -718,7 +718,7 @@ bool sk_mc_loop(struct sock *sk) return inet6_sk(sk)->mc_loop; #endif } - WARN_ON(1); + WARN_ON_ONCE(1); return true; } EXPORT_SYMBOL(sk_mc_loop); -- cgit v1.2.3 From f140ad9fe2ae16f385f8fe4dc9cf67bb4c51d794 Mon Sep 17 00:00:00 2001 From: Ciara Loftus Date: Tue, 9 Jun 2020 13:19:43 +0000 Subject: ixgbe: protect ring accesses with READ- and WRITE_ONCE READ_ONCE should be used when reading rings prior to accessing the statistics pointer. Introduce this as well as the corresponding WRITE_ONCE usage when allocating and freeing the rings, to ensure protected access. Signed-off-by: Ciara Loftus Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c | 12 ++++++------ drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 14 +++++++++++--- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c index fd9f5d41b594..2e35c5706cf1 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c @@ -921,7 +921,7 @@ static int ixgbe_alloc_q_vector(struct ixgbe_adapter *adapter, ring->queue_index = txr_idx; /* assign ring to adapter */ - adapter->tx_ring[txr_idx] = ring; + WRITE_ONCE(adapter->tx_ring[txr_idx], ring); /* update count and index */ txr_count--; @@ -948,7 +948,7 @@ static int ixgbe_alloc_q_vector(struct ixgbe_adapter *adapter, set_ring_xdp(ring); /* assign ring to adapter */ - adapter->xdp_ring[xdp_idx] = ring; + WRITE_ONCE(adapter->xdp_ring[xdp_idx], ring); /* update count and index */ xdp_count--; @@ -991,7 +991,7 @@ static int ixgbe_alloc_q_vector(struct ixgbe_adapter *adapter, ring->queue_index = rxr_idx; /* assign ring to adapter */ - adapter->rx_ring[rxr_idx] = ring; + WRITE_ONCE(adapter->rx_ring[rxr_idx], ring); /* update count and index */ rxr_count--; @@ -1020,13 +1020,13 @@ static void ixgbe_free_q_vector(struct ixgbe_adapter *adapter, int v_idx) ixgbe_for_each_ring(ring, q_vector->tx) { if (ring_is_xdp(ring)) - adapter->xdp_ring[ring->queue_index] = NULL; + WRITE_ONCE(adapter->xdp_ring[ring->queue_index], NULL); else - adapter->tx_ring[ring->queue_index] = NULL; + WRITE_ONCE(adapter->tx_ring[ring->queue_index], NULL); } ixgbe_for_each_ring(ring, q_vector->rx) - adapter->rx_ring[ring->queue_index] = NULL; + WRITE_ONCE(adapter->rx_ring[ring->queue_index], NULL); adapter->q_vector[v_idx] = NULL; napi_hash_del(&q_vector->napi); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index f162b8b8f345..97a423ecf808 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -7051,7 +7051,10 @@ void ixgbe_update_stats(struct ixgbe_adapter *adapter) } for (i = 0; i < adapter->num_rx_queues; i++) { - struct ixgbe_ring *rx_ring = adapter->rx_ring[i]; + struct ixgbe_ring *rx_ring = READ_ONCE(adapter->rx_ring[i]); + + if (!rx_ring) + continue; non_eop_descs += rx_ring->rx_stats.non_eop_descs; alloc_rx_page += rx_ring->rx_stats.alloc_rx_page; alloc_rx_page_failed += rx_ring->rx_stats.alloc_rx_page_failed; @@ -7072,15 +7075,20 @@ void ixgbe_update_stats(struct ixgbe_adapter *adapter) packets = 0; /* gather some stats to the adapter struct that are per queue */ for (i = 0; i < adapter->num_tx_queues; i++) { - struct ixgbe_ring *tx_ring = adapter->tx_ring[i]; + struct ixgbe_ring *tx_ring = READ_ONCE(adapter->tx_ring[i]); + + if (!tx_ring) + continue; restart_queue += tx_ring->tx_stats.restart_queue; tx_busy += tx_ring->tx_stats.tx_busy; bytes += tx_ring->stats.bytes; packets += tx_ring->stats.packets; } for (i = 0; i < adapter->num_xdp_queues; i++) { - struct ixgbe_ring *xdp_ring = adapter->xdp_ring[i]; + struct ixgbe_ring *xdp_ring = READ_ONCE(adapter->xdp_ring[i]); + if (!xdp_ring) + continue; restart_queue += xdp_ring->tx_stats.restart_queue; tx_busy += xdp_ring->tx_stats.tx_busy; bytes += xdp_ring->stats.bytes; -- cgit v1.2.3 From d59e267912cd90b0adf33b4659050d831e746317 Mon Sep 17 00:00:00 2001 From: Ciara Loftus Date: Tue, 9 Jun 2020 13:19:44 +0000 Subject: i40e: protect ring accesses with READ- and WRITE_ONCE READ_ONCE should be used when reading rings prior to accessing the statistics pointer. Introduce this as well as the corresponding WRITE_ONCE usage when allocating and freeing the rings, to ensure protected access. Signed-off-by: Ciara Loftus Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_main.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 5d807c8004f8..56ecd6c3f236 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -439,11 +439,15 @@ static void i40e_get_netdev_stats_struct(struct net_device *netdev, i40e_get_netdev_stats_struct_tx(ring, stats); if (i40e_enabled_xdp_vsi(vsi)) { - ring++; + ring = READ_ONCE(vsi->xdp_rings[i]); + if (!ring) + continue; i40e_get_netdev_stats_struct_tx(ring, stats); } - ring++; + ring = READ_ONCE(vsi->rx_rings[i]); + if (!ring) + continue; do { start = u64_stats_fetch_begin_irq(&ring->syncp); packets = ring->stats.packets; @@ -787,6 +791,8 @@ static void i40e_update_vsi_stats(struct i40e_vsi *vsi) for (q = 0; q < vsi->num_queue_pairs; q++) { /* locate Tx ring */ p = READ_ONCE(vsi->tx_rings[q]); + if (!p) + continue; do { start = u64_stats_fetch_begin_irq(&p->syncp); @@ -800,8 +806,11 @@ static void i40e_update_vsi_stats(struct i40e_vsi *vsi) tx_linearize += p->tx_stats.tx_linearize; tx_force_wb += p->tx_stats.tx_force_wb; - /* Rx queue is part of the same block as Tx queue */ - p = &p[1]; + /* locate Rx ring */ + p = READ_ONCE(vsi->rx_rings[q]); + if (!p) + continue; + do { start = u64_stats_fetch_begin_irq(&p->syncp); packets = p->stats.packets; @@ -10824,10 +10833,10 @@ static void i40e_vsi_clear_rings(struct i40e_vsi *vsi) if (vsi->tx_rings && vsi->tx_rings[0]) { for (i = 0; i < vsi->alloc_queue_pairs; i++) { kfree_rcu(vsi->tx_rings[i], rcu); - vsi->tx_rings[i] = NULL; - vsi->rx_rings[i] = NULL; + WRITE_ONCE(vsi->tx_rings[i], NULL); + WRITE_ONCE(vsi->rx_rings[i], NULL); if (vsi->xdp_rings) - vsi->xdp_rings[i] = NULL; + WRITE_ONCE(vsi->xdp_rings[i], NULL); } } } @@ -10861,7 +10870,7 @@ static int i40e_alloc_rings(struct i40e_vsi *vsi) if (vsi->back->hw_features & I40E_HW_WB_ON_ITR_CAPABLE) ring->flags = I40E_TXR_FLAGS_WB_ON_ITR; ring->itr_setting = pf->tx_itr_default; - vsi->tx_rings[i] = ring++; + WRITE_ONCE(vsi->tx_rings[i], ring++); if (!i40e_enabled_xdp_vsi(vsi)) goto setup_rx; @@ -10879,7 +10888,7 @@ static int i40e_alloc_rings(struct i40e_vsi *vsi) ring->flags = I40E_TXR_FLAGS_WB_ON_ITR; set_ring_xdp(ring); ring->itr_setting = pf->tx_itr_default; - vsi->xdp_rings[i] = ring++; + WRITE_ONCE(vsi->xdp_rings[i], ring++); setup_rx: ring->queue_index = i; @@ -10892,7 +10901,7 @@ setup_rx: ring->size = 0; ring->dcb_tc = 0; ring->itr_setting = pf->rx_itr_default; - vsi->rx_rings[i] = ring; + WRITE_ONCE(vsi->rx_rings[i], ring); } return 0; -- cgit v1.2.3 From b1d95cc2391ffac0c5b27256a4fb0d2cfb021a29 Mon Sep 17 00:00:00 2001 From: Ciara Loftus Date: Tue, 9 Jun 2020 13:19:45 +0000 Subject: ice: protect ring accesses with WRITE_ONCE The READ_ONCE macro is used when reading rings prior to accessing the statistics pointer. The corresponding WRITE_ONCE usage when allocating and freeing the rings to ensure protected access was not in place. Introduce this. Signed-off-by: Ciara Loftus Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_lib.c | 8 ++++---- drivers/net/ethernet/intel/ice/ice_main.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 28b46cc9f5cb..2e3a39cea2c0 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -1194,7 +1194,7 @@ static void ice_vsi_clear_rings(struct ice_vsi *vsi) for (i = 0; i < vsi->alloc_txq; i++) { if (vsi->tx_rings[i]) { kfree_rcu(vsi->tx_rings[i], rcu); - vsi->tx_rings[i] = NULL; + WRITE_ONCE(vsi->tx_rings[i], NULL); } } } @@ -1202,7 +1202,7 @@ static void ice_vsi_clear_rings(struct ice_vsi *vsi) for (i = 0; i < vsi->alloc_rxq; i++) { if (vsi->rx_rings[i]) { kfree_rcu(vsi->rx_rings[i], rcu); - vsi->rx_rings[i] = NULL; + WRITE_ONCE(vsi->rx_rings[i], NULL); } } } @@ -1235,7 +1235,7 @@ static int ice_vsi_alloc_rings(struct ice_vsi *vsi) ring->vsi = vsi; ring->dev = dev; ring->count = vsi->num_tx_desc; - vsi->tx_rings[i] = ring; + WRITE_ONCE(vsi->tx_rings[i], ring); } /* Allocate Rx rings */ @@ -1254,7 +1254,7 @@ static int ice_vsi_alloc_rings(struct ice_vsi *vsi) ring->netdev = vsi->netdev; ring->dev = dev; ring->count = vsi->num_rx_desc; - vsi->rx_rings[i] = ring; + WRITE_ONCE(vsi->rx_rings[i], ring); } return 0; diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 082825e3cb39..4cbd49c87568 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -1702,7 +1702,7 @@ static int ice_xdp_alloc_setup_rings(struct ice_vsi *vsi) xdp_ring->netdev = NULL; xdp_ring->dev = dev; xdp_ring->count = vsi->num_tx_desc; - vsi->xdp_rings[i] = xdp_ring; + WRITE_ONCE(vsi->xdp_rings[i], xdp_ring); if (ice_setup_tx_ring(xdp_ring)) goto free_xdp_rings; ice_set_ring_xdp(xdp_ring); -- cgit v1.2.3 From 3995ecbabc6f8019b3a55aeb551eeb9598b8b354 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Fri, 12 Jun 2020 13:47:31 +0200 Subject: i40e: fix crash when Rx descriptor count is changed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the AF_XDP buffer allocator was introduced, the Rx SW ring "rx_bi" allocation was moved from i40e_setup_rx_descriptors() function, and was instead done in the i40e_configure_rx_ring() function. This broke the ethtool set_ringparam() hook for changing the Rx descriptor count, which was relying on i40e_setup_rx_descriptors() to handle the allocation. Fix this by adding an explicit i40e_alloc_rx_bi() call to i40e_set_ringparam(). Fixes: be1222b585fd ("i40e: Separate kernel allocated rx_bi rings from AF_XDP rings") Signed-off-by: Björn Töpel Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index aa8026b1eb81..67806b7b2f49 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -2070,6 +2070,9 @@ static int i40e_set_ringparam(struct net_device *netdev, */ rx_rings[i].tail = hw->hw_addr + I40E_PRTGEN_STATUS; err = i40e_setup_rx_descriptors(&rx_rings[i]); + if (err) + goto rx_unwind; + err = i40e_alloc_rx_bi(&rx_rings[i]); if (err) goto rx_unwind; -- cgit v1.2.3 From 24f5aa53afbf72ffd7d2587cd2edd44342e0270f Mon Sep 17 00:00:00 2001 From: Flavio Suligoi Date: Fri, 19 Jun 2020 11:02:04 +0200 Subject: net: ethernet: neterion: vxge: fix spelling mistake Fix typo: "trigered" --> "triggered" Signed-off-by: Flavio Suligoi Signed-off-by: David S. Miller --- drivers/net/ethernet/neterion/vxge/vxge-config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/neterion/vxge/vxge-config.h b/drivers/net/ethernet/neterion/vxge/vxge-config.h index 628fa9b2f741..373165119850 100644 --- a/drivers/net/ethernet/neterion/vxge/vxge-config.h +++ b/drivers/net/ethernet/neterion/vxge/vxge-config.h @@ -297,7 +297,7 @@ struct vxge_hw_fifo_config { * @greedy_return: If Set it forces the device to return absolutely all RxD * that are consumed and still on board when a timer interrupt * triggers. If Clear, then if the device has already returned - * RxD before current timer interrupt trigerred and after the + * RxD before current timer interrupt triggered and after the * previous timer interrupt triggered, then the device is not * forced to returned the rest of the consumed RxD that it has * on board which account for a byte count less than the one -- cgit v1.2.3 From e2c0b9712452296cb50a9faf07cc688862238f24 Mon Sep 17 00:00:00 2001 From: Stefan Schmidt Date: Tue, 16 Jun 2020 08:47:32 +0200 Subject: docs: net: ieee802154: change link to new project URL We finally came around to setup a new project website. Update the reference here. Signed-off-by: Stefan Schmidt --- Documentation/networking/ieee802154.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/ieee802154.rst b/Documentation/networking/ieee802154.rst index 36ca823a1122..6f4bf8447a21 100644 --- a/Documentation/networking/ieee802154.rst +++ b/Documentation/networking/ieee802154.rst @@ -30,8 +30,8 @@ Socket API The address family, socket addresses etc. are defined in the include/net/af_ieee802154.h header or in the special header -in the userspace package (see either http://wpan.cakelab.org/ or the -git tree at https://github.com/linux-wpan/wpan-tools). +in the userspace package (see either https://linux-wpan.org/wpan-tools.html +or the git tree at https://github.com/linux-wpan/wpan-tools). 6LoWPAN Linux implementation ============================ -- cgit v1.2.3 From e795a61a85e65b4f12c2ed3529911ac0f013fa40 Mon Sep 17 00:00:00 2001 From: Stefan Schmidt Date: Tue, 16 Jun 2020 08:49:10 +0200 Subject: MAINTAINERS: update ieee802154 project website URL Update URL to our new home. Signed-off-by: Stefan Schmidt --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 301330e02bca..a09e61268c08 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8333,7 +8333,7 @@ M: Alexander Aring M: Stefan Schmidt L: linux-wpan@vger.kernel.org S: Maintained -W: http://wpan.cakelab.org/ +W: https://linux-wpan.org/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/sschmidt/wpan.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/sschmidt/wpan-next.git F: Documentation/networking/ieee802154.rst -- cgit v1.2.3 From 6564cfefb01c4c5b8c357f300feac5cb1585e1a9 Mon Sep 17 00:00:00 2001 From: Flavio Suligoi Date: Fri, 19 Jun 2020 11:18:11 +0200 Subject: net: ethernet: oki-semi: pch_gbe: fix spelling mistake Fix typo: "Triger" --> "Trigger" Signed-off-by: Flavio Suligoi Signed-off-by: David S. Miller --- drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h index 32b9d7705404..55cef5b16aa5 100644 --- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h +++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h @@ -147,7 +147,7 @@ struct pch_gbe_regs { #define PCH_GBE_RH_ALM_FULL_8 0x00001000 /* 8 words */ #define PCH_GBE_RH_ALM_FULL_16 0x00002000 /* 16 words */ #define PCH_GBE_RH_ALM_FULL_32 0x00003000 /* 32 words */ -/* RX FIFO Read Triger Threshold */ +/* RX FIFO Read Trigger Threshold */ #define PCH_GBE_RH_RD_TRG_4 0x00000000 /* 4 words */ #define PCH_GBE_RH_RD_TRG_8 0x00000200 /* 8 words */ #define PCH_GBE_RH_RD_TRG_16 0x00000400 /* 16 words */ -- cgit v1.2.3 From 5a8d7f126c97d04d893f5e5be2b286437a0d01b0 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 19 Jun 2020 11:47:46 -0700 Subject: of: of_mdio: Correct loop scanning logic Commit 209c65b61d94 ("drivers/of/of_mdio.c:fix of_mdiobus_register()") introduced a break of the loop on the premise that a successful registration should exit the loop. The premise is correct but not to code, because rc && rc != -ENODEV is just a special error condition, that means we would exit the loop even with rc == -ENODEV which is absolutely not correct since this is the error code to indicate to the MDIO bus layer that scanning should continue. Fix this by explicitly checking for rc = 0 as the only valid condition to break out of the loop. Fixes: 209c65b61d94 ("drivers/of/of_mdio.c:fix of_mdiobus_register()") Reviewed-by: Andrew Lunn Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/of/of_mdio.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c index a04afe79529c..ef6f818ce5b3 100644 --- a/drivers/of/of_mdio.c +++ b/drivers/of/of_mdio.c @@ -314,10 +314,15 @@ int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np) child, addr); if (of_mdiobus_child_is_phy(child)) { + /* -ENODEV is the return code that PHYLIB has + * standardized on to indicate that bus + * scanning should continue. + */ rc = of_mdiobus_register_phy(mdio, child, addr); - if (rc && rc != -ENODEV) + if (!rc) + break; + if (rc != -ENODEV) goto unregister; - break; } } } -- cgit v1.2.3 From b2ffc75e2e990b09903f9d15ccd53bc5f3a4217c Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 19 Jun 2020 11:47:47 -0700 Subject: net: phy: Check harder for errors in get_phy_id() Commit 02a6efcab675 ("net: phy: allow scanning busses with missing phys") added a special condition to return -ENODEV in case -ENODEV or -EIO was returned from the first read of the MII_PHYSID1 register. In case the MDIO bus data line pull-up is not strong enough, the MDIO bus controller will not flag this as a read error. This can happen when a pluggable daughter card is not connected and weak internal pull-ups are used (since that is the only option, otherwise the pins are floating). The second read of MII_PHYSID2 will be correctly flagged an error though, but now we will return -EIO which will be treated as a hard error, thus preventing MDIO bus scanning loops to continue succesfully. Apply the same logic to both register reads, thus allowing the scanning logic to proceed. Fixes: 02a6efcab675 ("net: phy: allow scanning busses with missing phys") Reviewed-by: Andrew Lunn Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 04946de74fa0..85ba95b598b5 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -794,8 +794,10 @@ static int get_phy_id(struct mii_bus *bus, int addr, u32 *phy_id, /* Grab the bits from PHYIR2, and put them in the lower half */ phy_reg = mdiobus_read(bus, addr, MII_PHYSID2); - if (phy_reg < 0) - return -EIO; + if (phy_reg < 0) { + /* returning -ENODEV doesn't stop bus scanning */ + return (phy_reg == -EIO || phy_reg == -ENODEV) ? -ENODEV : -EIO; + } *phy_id |= phy_reg; -- cgit v1.2.3 From faa620876b01d6744f1599e279042bb8149247ab Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Thu, 18 Jun 2020 11:37:40 +0300 Subject: net: macb: undo operations in case of failure Undo previously done operation in case macb_phylink_connect() fails. Since macb_reset_hw() is the 1st undo operation the napi_exit label was renamed to reset_hw. Fixes: 7897b071ac3b ("net: macb: convert to phylink") Signed-off-by: Claudiu Beznea Acked-by: Nicolas Ferre Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb_main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 67933079aeea..257c4920cb88 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -2558,7 +2558,7 @@ static int macb_open(struct net_device *dev) err = macb_phylink_connect(bp); if (err) - goto napi_exit; + goto reset_hw; netif_tx_start_all_queues(dev); @@ -2567,9 +2567,11 @@ static int macb_open(struct net_device *dev) return 0; -napi_exit: +reset_hw: + macb_reset_hw(bp); for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) napi_disable(&queue->napi); + macb_free_consistent(bp); pm_exit: pm_runtime_put_sync(&bp->pdev->dev); return err; -- cgit v1.2.3 From 9deba33f1b7266a3870c9da31f787b605748fc0c Mon Sep 17 00:00:00 2001 From: Claudiu Manoil Date: Thu, 18 Jun 2020 12:16:52 +0300 Subject: enetc: Fix HW_VLAN_CTAG_TX|RX toggling VLAN tag insertion/extraction offload is correctly activated at probe time but deactivation of this feature (i.e. via ethtool) is broken. Toggling works only for Tx/Rx ring 0 of a PF, and is ignored for the other rings, including the VF rings. To fix this, the existing VLAN offload toggling code was extended to all the rings assigned to a netdevice, instead of the default ring 0 (likely a leftover from the early validation days of this feature). And the code was moved to the common set_features() function to fix toggling for the VF driver too. Fixes: d4fd0404c1c9 ("enetc: Introduce basic PF and VF ENETC ethernet drivers") Signed-off-by: Claudiu Manoil Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc.c | 26 +++++++++++++++++++++++++ drivers/net/ethernet/freescale/enetc/enetc_hw.h | 16 +++++++-------- drivers/net/ethernet/freescale/enetc/enetc_pf.c | 8 -------- 3 files changed, 34 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 298c55786fd9..96831f49925c 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -1595,6 +1595,24 @@ static int enetc_set_psfp(struct net_device *ndev, int en) return 0; } +static void enetc_enable_rxvlan(struct net_device *ndev, bool en) +{ + struct enetc_ndev_priv *priv = netdev_priv(ndev); + int i; + + for (i = 0; i < priv->num_rx_rings; i++) + enetc_bdr_enable_rxvlan(&priv->si->hw, i, en); +} + +static void enetc_enable_txvlan(struct net_device *ndev, bool en) +{ + struct enetc_ndev_priv *priv = netdev_priv(ndev); + int i; + + for (i = 0; i < priv->num_tx_rings; i++) + enetc_bdr_enable_txvlan(&priv->si->hw, i, en); +} + int enetc_set_features(struct net_device *ndev, netdev_features_t features) { @@ -1604,6 +1622,14 @@ int enetc_set_features(struct net_device *ndev, if (changed & NETIF_F_RXHASH) enetc_set_rss(ndev, !!(features & NETIF_F_RXHASH)); + if (changed & NETIF_F_HW_VLAN_CTAG_RX) + enetc_enable_rxvlan(ndev, + !!(features & NETIF_F_HW_VLAN_CTAG_RX)); + + if (changed & NETIF_F_HW_VLAN_CTAG_TX) + enetc_enable_txvlan(ndev, + !!(features & NETIF_F_HW_VLAN_CTAG_TX)); + if (changed & NETIF_F_HW_TC) err = enetc_set_psfp(ndev, !!(features & NETIF_F_HW_TC)); diff --git a/drivers/net/ethernet/freescale/enetc/enetc_hw.h b/drivers/net/ethernet/freescale/enetc/enetc_hw.h index 6314051bc6c1..ce0d321c0639 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_hw.h +++ b/drivers/net/ethernet/freescale/enetc/enetc_hw.h @@ -531,22 +531,22 @@ struct enetc_msg_cmd_header { /* Common H/W utility functions */ -static inline void enetc_enable_rxvlan(struct enetc_hw *hw, int si_idx, - bool en) +static inline void enetc_bdr_enable_rxvlan(struct enetc_hw *hw, int idx, + bool en) { - u32 val = enetc_rxbdr_rd(hw, si_idx, ENETC_RBMR); + u32 val = enetc_rxbdr_rd(hw, idx, ENETC_RBMR); val = (val & ~ENETC_RBMR_VTE) | (en ? ENETC_RBMR_VTE : 0); - enetc_rxbdr_wr(hw, si_idx, ENETC_RBMR, val); + enetc_rxbdr_wr(hw, idx, ENETC_RBMR, val); } -static inline void enetc_enable_txvlan(struct enetc_hw *hw, int si_idx, - bool en) +static inline void enetc_bdr_enable_txvlan(struct enetc_hw *hw, int idx, + bool en) { - u32 val = enetc_txbdr_rd(hw, si_idx, ENETC_TBMR); + u32 val = enetc_txbdr_rd(hw, idx, ENETC_TBMR); val = (val & ~ENETC_TBMR_VIH) | (en ? ENETC_TBMR_VIH : 0); - enetc_txbdr_wr(hw, si_idx, ENETC_TBMR, val); + enetc_txbdr_wr(hw, idx, ENETC_TBMR, val); } static inline void enetc_set_bdr_prio(struct enetc_hw *hw, int bdr_idx, diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c index 824d211ec00f..4fac57dbb3c8 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c @@ -649,14 +649,6 @@ static int enetc_pf_set_features(struct net_device *ndev, netdev_features_t changed = ndev->features ^ features; struct enetc_ndev_priv *priv = netdev_priv(ndev); - if (changed & NETIF_F_HW_VLAN_CTAG_RX) - enetc_enable_rxvlan(&priv->si->hw, 0, - !!(features & NETIF_F_HW_VLAN_CTAG_RX)); - - if (changed & NETIF_F_HW_VLAN_CTAG_TX) - enetc_enable_txvlan(&priv->si->hw, 0, - !!(features & NETIF_F_HW_VLAN_CTAG_TX)); - if (changed & NETIF_F_HW_VLAN_CTAG_FILTER) { struct enetc_pf *pf = enetc_si_priv(priv->si); -- cgit v1.2.3 From 56c09de347e40804fc8dad155272fb9609e0a97b Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 18 Jun 2020 12:13:22 +0200 Subject: geneve: allow changing DF behavior after creation Currently, trying to change the DF parameter of a geneve device does nothing: # ip -d link show geneve1 14: geneve1: link/ether geneve id 1 remote 10.0.0.1 ttl auto df set dstport 6081 # ip link set geneve1 type geneve id 1 df unset # ip -d link show geneve1 14: geneve1: link/ether geneve id 1 remote 10.0.0.1 ttl auto df set dstport 6081 We just need to update the value in geneve_changelink. Fixes: a025fb5f49ad ("geneve: Allow configuration of DF behaviour") Signed-off-by: Sabrina Dubroca Reviewed-by: Stefano Brivio Signed-off-by: David S. Miller --- drivers/net/geneve.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 75266580b586..4661ef865807 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1649,6 +1649,7 @@ static int geneve_changelink(struct net_device *dev, struct nlattr *tb[], geneve->collect_md = metadata; geneve->use_udp6_rx_checksums = use_udp6_rx_checksums; geneve->ttl_inherit = ttl_inherit; + geneve->df = df; geneve_unquiesce(geneve, gs4, gs6); return 0; -- cgit v1.2.3 From 26f2eb27d081081dbea6d3c11602c9ece36f4d9c Mon Sep 17 00:00:00 2001 From: wenxu Date: Thu, 18 Jun 2020 20:49:08 +0800 Subject: flow_offload: add flow_indr_block_cb_alloc/remove function Add flow_indr_block_cb_alloc/remove function for next fix patch. Signed-off-by: wenxu Signed-off-by: David S. Miller --- include/net/flow_offload.h | 13 +++++++++++++ net/core/flow_offload.c | 21 +++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index f2c8311a0433..bf4343042956 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -467,6 +467,12 @@ struct flow_block_cb { struct flow_block_cb *flow_block_cb_alloc(flow_setup_cb_t *cb, void *cb_ident, void *cb_priv, void (*release)(void *cb_priv)); +struct flow_block_cb *flow_indr_block_cb_alloc(flow_setup_cb_t *cb, + void *cb_ident, void *cb_priv, + void (*release)(void *cb_priv), + struct flow_block_offload *bo, + struct net_device *dev, void *data, + void (*cleanup)(struct flow_block_cb *block_cb)); void flow_block_cb_free(struct flow_block_cb *block_cb); struct flow_block_cb *flow_block_cb_lookup(struct flow_block *block, @@ -488,6 +494,13 @@ static inline void flow_block_cb_remove(struct flow_block_cb *block_cb, list_move(&block_cb->list, &offload->cb_list); } +static inline void flow_indr_block_cb_remove(struct flow_block_cb *block_cb, + struct flow_block_offload *offload) +{ + list_del(&block_cb->indr.list); + list_move(&block_cb->list, &offload->cb_list); +} + bool flow_block_cb_is_busy(flow_setup_cb_t *cb, void *cb_ident, struct list_head *driver_block_list); diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index 0cfc35e6be28..1fd781d155b4 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -437,6 +437,27 @@ static void flow_block_indr_init(struct flow_block_cb *flow_block, flow_block->indr.cleanup = cleanup; } +struct flow_block_cb *flow_indr_block_cb_alloc(flow_setup_cb_t *cb, + void *cb_ident, void *cb_priv, + void (*release)(void *cb_priv), + struct flow_block_offload *bo, + struct net_device *dev, void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) +{ + struct flow_block_cb *block_cb; + + block_cb = flow_block_cb_alloc(cb, cb_ident, cb_priv, release); + if (IS_ERR(block_cb)) + goto out; + + flow_block_indr_init(block_cb, bo, dev, data, cleanup); + list_add(&block_cb->indr.list, &flow_block_indr_list); + +out: + return block_cb; +} +EXPORT_SYMBOL(flow_indr_block_cb_alloc); + static void __flow_block_indr_binding(struct flow_block_offload *bo, struct net_device *dev, void *data, void (*cleanup)(struct flow_block_cb *block_cb)) -- cgit v1.2.3 From 66f1939a1b705305df820d65f4d9a8457d05759c Mon Sep 17 00:00:00 2001 From: wenxu Date: Thu, 18 Jun 2020 20:49:09 +0800 Subject: flow_offload: use flow_indr_block_cb_alloc/remove function Prepare fix the bug in the next patch. use flow_indr_block_cb_alloc/remove function and remove the __flow_block_indr_binding. Signed-off-by: wenxu Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c | 19 ++++++++++++------- .../net/ethernet/mellanox/mlx5/core/en/rep/tc.c | 21 ++++++++++++++------- drivers/net/ethernet/netronome/nfp/flower/main.h | 4 +++- .../net/ethernet/netronome/nfp/flower/offload.c | 18 +++++++++++------- include/net/flow_offload.h | 4 +++- net/core/flow_offload.c | 22 +--------------------- 6 files changed, 44 insertions(+), 44 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c index 0eef4f5e4a46..3e3a8841689d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c @@ -1889,7 +1889,8 @@ static void bnxt_tc_setup_indr_rel(void *cb_priv) } static int bnxt_tc_setup_indr_block(struct net_device *netdev, struct bnxt *bp, - struct flow_block_offload *f) + struct flow_block_offload *f, void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) { struct bnxt_flower_indr_block_cb_priv *cb_priv; struct flow_block_cb *block_cb; @@ -1907,9 +1908,10 @@ static int bnxt_tc_setup_indr_block(struct net_device *netdev, struct bnxt *bp, cb_priv->bp = bp; list_add(&cb_priv->list, &bp->tc_indr_block_list); - block_cb = flow_block_cb_alloc(bnxt_tc_setup_indr_block_cb, - cb_priv, cb_priv, - bnxt_tc_setup_indr_rel); + block_cb = flow_indr_block_cb_alloc(bnxt_tc_setup_indr_block_cb, + cb_priv, cb_priv, + bnxt_tc_setup_indr_rel, f, + netdev, data, cleanup); if (IS_ERR(block_cb)) { list_del(&cb_priv->list); kfree(cb_priv); @@ -1930,7 +1932,7 @@ static int bnxt_tc_setup_indr_block(struct net_device *netdev, struct bnxt *bp, if (!block_cb) return -ENOENT; - flow_block_cb_remove(block_cb, f); + flow_indr_block_cb_remove(block_cb, f); list_del(&block_cb->driver_list); break; default: @@ -1945,14 +1947,17 @@ static bool bnxt_is_netdev_indr_offload(struct net_device *netdev) } static int bnxt_tc_setup_indr_cb(struct net_device *netdev, void *cb_priv, - enum tc_setup_type type, void *type_data) + enum tc_setup_type type, void *type_data, + void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) { if (!bnxt_is_netdev_indr_offload(netdev)) return -EOPNOTSUPP; switch (type) { case TC_SETUP_BLOCK: - return bnxt_tc_setup_indr_block(netdev, cb_priv, type_data); + return bnxt_tc_setup_indr_block(netdev, cb_priv, type_data, data, + cleanup); default: break; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c index 80713123de5c..d629864fc996 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c @@ -407,7 +407,9 @@ static int mlx5e_rep_indr_setup_block(struct net_device *netdev, struct mlx5e_rep_priv *rpriv, struct flow_block_offload *f, - flow_setup_cb_t *setup_cb) + flow_setup_cb_t *setup_cb, + void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) { struct mlx5e_priv *priv = netdev_priv(rpriv->netdev); struct mlx5e_rep_indr_block_priv *indr_priv; @@ -438,8 +440,9 @@ mlx5e_rep_indr_setup_block(struct net_device *netdev, list_add(&indr_priv->list, &rpriv->uplink_priv.tc_indr_block_priv_list); - block_cb = flow_block_cb_alloc(setup_cb, indr_priv, indr_priv, - mlx5e_rep_indr_block_unbind); + block_cb = flow_indr_block_cb_alloc(setup_cb, indr_priv, indr_priv, + mlx5e_rep_indr_block_unbind, + f, netdev, data, cleanup); if (IS_ERR(block_cb)) { list_del(&indr_priv->list); kfree(indr_priv); @@ -458,7 +461,7 @@ mlx5e_rep_indr_setup_block(struct net_device *netdev, if (!block_cb) return -ENOENT; - flow_block_cb_remove(block_cb, f); + flow_indr_block_cb_remove(block_cb, f); list_del(&block_cb->driver_list); return 0; default: @@ -469,15 +472,19 @@ mlx5e_rep_indr_setup_block(struct net_device *netdev, static int mlx5e_rep_indr_setup_cb(struct net_device *netdev, void *cb_priv, - enum tc_setup_type type, void *type_data) + enum tc_setup_type type, void *type_data, + void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) { switch (type) { case TC_SETUP_BLOCK: return mlx5e_rep_indr_setup_block(netdev, cb_priv, type_data, - mlx5e_rep_indr_setup_tc_cb); + mlx5e_rep_indr_setup_tc_cb, + data, cleanup); case TC_SETUP_FT: return mlx5e_rep_indr_setup_block(netdev, cb_priv, type_data, - mlx5e_rep_indr_setup_ft_cb); + mlx5e_rep_indr_setup_ft_cb, + data, cleanup); default: return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h index 6c3dc3baf387..56b3b68e273e 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/main.h +++ b/drivers/net/ethernet/netronome/nfp/flower/main.h @@ -459,7 +459,9 @@ int nfp_flower_setup_qos_offload(struct nfp_app *app, struct net_device *netdev, struct tc_cls_matchall_offload *flow); void nfp_flower_stats_rlim_reply(struct nfp_app *app, struct sk_buff *skb); int nfp_flower_indr_setup_tc_cb(struct net_device *netdev, void *cb_priv, - enum tc_setup_type type, void *type_data); + enum tc_setup_type type, void *type_data, + void *data, + void (*cleanup)(struct flow_block_cb *block_cb)); int nfp_flower_setup_indr_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv); diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index 695d24b9dd92..95c75250c355 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -1647,7 +1647,8 @@ static void nfp_flower_setup_indr_tc_release(void *cb_priv) static int nfp_flower_setup_indr_tc_block(struct net_device *netdev, struct nfp_app *app, - struct flow_block_offload *f) + struct flow_block_offload *f, void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) { struct nfp_flower_indr_block_cb_priv *cb_priv; struct nfp_flower_priv *priv = app->priv; @@ -1676,9 +1677,10 @@ nfp_flower_setup_indr_tc_block(struct net_device *netdev, struct nfp_app *app, cb_priv->app = app; list_add(&cb_priv->list, &priv->indr_block_cb_priv); - block_cb = flow_block_cb_alloc(nfp_flower_setup_indr_block_cb, - cb_priv, cb_priv, - nfp_flower_setup_indr_tc_release); + block_cb = flow_indr_block_cb_alloc(nfp_flower_setup_indr_block_cb, + cb_priv, cb_priv, + nfp_flower_setup_indr_tc_release, + f, netdev, data, cleanup); if (IS_ERR(block_cb)) { list_del(&cb_priv->list); kfree(cb_priv); @@ -1699,7 +1701,7 @@ nfp_flower_setup_indr_tc_block(struct net_device *netdev, struct nfp_app *app, if (!block_cb) return -ENOENT; - flow_block_cb_remove(block_cb, f); + flow_indr_block_cb_remove(block_cb, f); list_del(&block_cb->driver_list); return 0; default: @@ -1710,7 +1712,9 @@ nfp_flower_setup_indr_tc_block(struct net_device *netdev, struct nfp_app *app, int nfp_flower_indr_setup_tc_cb(struct net_device *netdev, void *cb_priv, - enum tc_setup_type type, void *type_data) + enum tc_setup_type type, void *type_data, + void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) { if (!nfp_fl_is_netdev_to_offload(netdev)) return -EOPNOTSUPP; @@ -1718,7 +1722,7 @@ nfp_flower_indr_setup_tc_cb(struct net_device *netdev, void *cb_priv, switch (type) { case TC_SETUP_BLOCK: return nfp_flower_setup_indr_tc_block(netdev, cb_priv, - type_data); + type_data, data, cleanup); default: return -EOPNOTSUPP; } diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index bf4343042956..1961c7982273 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -545,7 +545,9 @@ static inline void flow_block_init(struct flow_block *flow_block) } typedef int flow_indr_block_bind_cb_t(struct net_device *dev, void *cb_priv, - enum tc_setup_type type, void *type_data); + enum tc_setup_type type, void *type_data, + void *data, + void (*cleanup)(struct flow_block_cb *block_cb)); int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv); void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv, diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index 1fd781d155b4..ddd958c6040e 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -458,25 +458,6 @@ out: } EXPORT_SYMBOL(flow_indr_block_cb_alloc); -static void __flow_block_indr_binding(struct flow_block_offload *bo, - struct net_device *dev, void *data, - void (*cleanup)(struct flow_block_cb *block_cb)) -{ - struct flow_block_cb *block_cb; - - list_for_each_entry(block_cb, &bo->cb_list, list) { - switch (bo->command) { - case FLOW_BLOCK_BIND: - flow_block_indr_init(block_cb, bo, dev, data, cleanup); - list_add(&block_cb->indr.list, &flow_block_indr_list); - break; - case FLOW_BLOCK_UNBIND: - list_del(&block_cb->indr.list); - break; - } - } -} - int flow_indr_dev_setup_offload(struct net_device *dev, enum tc_setup_type type, void *data, struct flow_block_offload *bo, @@ -486,9 +467,8 @@ int flow_indr_dev_setup_offload(struct net_device *dev, mutex_lock(&flow_indr_block_lock); list_for_each_entry(this, &flow_block_indr_dev_list, list) - this->cb(dev, this->cb_priv, type, bo); + this->cb(dev, this->cb_priv, type, bo, data, cleanup); - __flow_block_indr_binding(bo, dev, data, cleanup); mutex_unlock(&flow_indr_block_lock); return list_empty(&bo->cb_list) ? -EOPNOTSUPP : 0; -- cgit v1.2.3 From a1db217861f33b8d9ea8171bcacee51186e2d5ba Mon Sep 17 00:00:00 2001 From: wenxu Date: Thu, 18 Jun 2020 20:49:10 +0800 Subject: net: flow_offload: fix flow_indr_dev_unregister path If the representor is removed, then identify the indirect flow_blocks that need to be removed by the release callback and the port representor structure. To identify the port representor structure, a new indr.cb_priv field needs to be introduced. The flow_block also needs to be removed from the driver list from the cleanup path. Fixes: 1fac52da5942 ("net: flow_offload: consolidate indirect flow_block infrastructure") Signed-off-by: wenxu Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c | 5 +++-- drivers/net/ethernet/netronome/nfp/flower/main.c | 2 +- drivers/net/ethernet/netronome/nfp/flower/main.h | 3 +-- drivers/net/ethernet/netronome/nfp/flower/offload.c | 8 ++++---- include/net/flow_offload.h | 4 +++- net/core/flow_offload.c | 16 ++++++++++------ net/netfilter/nf_flow_table_offload.c | 1 + net/netfilter/nf_tables_offload.c | 1 + net/sched/cls_api.c | 1 + 10 files changed, 27 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c index 3e3a8841689d..4a11c1e7cc02 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c @@ -1911,7 +1911,7 @@ static int bnxt_tc_setup_indr_block(struct net_device *netdev, struct bnxt *bp, block_cb = flow_indr_block_cb_alloc(bnxt_tc_setup_indr_block_cb, cb_priv, cb_priv, bnxt_tc_setup_indr_rel, f, - netdev, data, cleanup); + netdev, data, bp, cleanup); if (IS_ERR(block_cb)) { list_del(&cb_priv->list); kfree(cb_priv); @@ -2079,7 +2079,7 @@ void bnxt_shutdown_tc(struct bnxt *bp) return; flow_indr_dev_unregister(bnxt_tc_setup_indr_cb, bp, - bnxt_tc_setup_indr_block_cb); + bnxt_tc_setup_indr_rel); rhashtable_destroy(&tc_info->flow_table); rhashtable_destroy(&tc_info->l2_table); rhashtable_destroy(&tc_info->decap_l2_table); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c index d629864fc996..eefeb1cdc2ee 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c @@ -442,7 +442,8 @@ mlx5e_rep_indr_setup_block(struct net_device *netdev, block_cb = flow_indr_block_cb_alloc(setup_cb, indr_priv, indr_priv, mlx5e_rep_indr_block_unbind, - f, netdev, data, cleanup); + f, netdev, data, rpriv, + cleanup); if (IS_ERR(block_cb)) { list_del(&indr_priv->list); kfree(indr_priv); @@ -503,7 +504,7 @@ int mlx5e_rep_tc_netdevice_event_register(struct mlx5e_rep_priv *rpriv) void mlx5e_rep_tc_netdevice_event_unregister(struct mlx5e_rep_priv *rpriv) { flow_indr_dev_unregister(mlx5e_rep_indr_setup_cb, rpriv, - mlx5e_rep_indr_setup_tc_cb); + mlx5e_rep_indr_block_unbind); } #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c index c39327677a7d..bb448c82cdc2 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/main.c +++ b/drivers/net/ethernet/netronome/nfp/flower/main.c @@ -861,7 +861,7 @@ static void nfp_flower_clean(struct nfp_app *app) flush_work(&app_priv->cmsg_work); flow_indr_dev_unregister(nfp_flower_indr_setup_tc_cb, app, - nfp_flower_setup_indr_block_cb); + nfp_flower_setup_indr_tc_release); if (app_priv->flower_ext_feats & NFP_FL_FEATS_VF_RLIM) nfp_flower_qos_cleanup(app); diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h index 56b3b68e273e..7f54a620acad 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/main.h +++ b/drivers/net/ethernet/netronome/nfp/flower/main.h @@ -462,8 +462,7 @@ int nfp_flower_indr_setup_tc_cb(struct net_device *netdev, void *cb_priv, enum tc_setup_type type, void *type_data, void *data, void (*cleanup)(struct flow_block_cb *block_cb)); -int nfp_flower_setup_indr_block_cb(enum tc_setup_type type, void *type_data, - void *cb_priv); +void nfp_flower_setup_indr_tc_release(void *cb_priv); void __nfp_flower_non_repr_priv_get(struct nfp_flower_non_repr_priv *non_repr_priv); diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index 95c75250c355..d7340dc09b4c 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -1619,8 +1619,8 @@ nfp_flower_indr_block_cb_priv_lookup(struct nfp_app *app, return NULL; } -int nfp_flower_setup_indr_block_cb(enum tc_setup_type type, - void *type_data, void *cb_priv) +static int nfp_flower_setup_indr_block_cb(enum tc_setup_type type, + void *type_data, void *cb_priv) { struct nfp_flower_indr_block_cb_priv *priv = cb_priv; struct flow_cls_offload *flower = type_data; @@ -1637,7 +1637,7 @@ int nfp_flower_setup_indr_block_cb(enum tc_setup_type type, } } -static void nfp_flower_setup_indr_tc_release(void *cb_priv) +void nfp_flower_setup_indr_tc_release(void *cb_priv) { struct nfp_flower_indr_block_cb_priv *priv = cb_priv; @@ -1680,7 +1680,7 @@ nfp_flower_setup_indr_tc_block(struct net_device *netdev, struct nfp_app *app, block_cb = flow_indr_block_cb_alloc(nfp_flower_setup_indr_block_cb, cb_priv, cb_priv, nfp_flower_setup_indr_tc_release, - f, netdev, data, cleanup); + f, netdev, data, app, cleanup); if (IS_ERR(block_cb)) { list_del(&cb_priv->list); kfree(cb_priv); diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 1961c7982273..6315324b9dc2 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -450,6 +450,7 @@ struct flow_block_indr { struct net_device *dev; enum flow_block_binder_type binder_type; void *data; + void *cb_priv; void (*cleanup)(struct flow_block_cb *block_cb); }; @@ -472,6 +473,7 @@ struct flow_block_cb *flow_indr_block_cb_alloc(flow_setup_cb_t *cb, void (*release)(void *cb_priv), struct flow_block_offload *bo, struct net_device *dev, void *data, + void *indr_cb_priv, void (*cleanup)(struct flow_block_cb *block_cb)); void flow_block_cb_free(struct flow_block_cb *block_cb); @@ -551,7 +553,7 @@ typedef int flow_indr_block_bind_cb_t(struct net_device *dev, void *cb_priv, int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv); void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv, - flow_setup_cb_t *setup_cb); + void (*release)(void *cb_priv)); int flow_indr_dev_setup_offload(struct net_device *dev, enum tc_setup_type type, void *data, struct flow_block_offload *bo, diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index ddd958c6040e..b739cfab796e 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -372,14 +372,15 @@ int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv) } EXPORT_SYMBOL(flow_indr_dev_register); -static void __flow_block_indr_cleanup(flow_setup_cb_t *setup_cb, void *cb_priv, +static void __flow_block_indr_cleanup(void (*release)(void *cb_priv), + void *cb_priv, struct list_head *cleanup_list) { struct flow_block_cb *this, *next; list_for_each_entry_safe(this, next, &flow_block_indr_list, indr.list) { - if (this->cb == setup_cb && - this->cb_priv == cb_priv) { + if (this->release == release && + this->indr.cb_priv == cb_priv) { list_move(&this->indr.list, cleanup_list); return; } @@ -397,7 +398,7 @@ static void flow_block_indr_notify(struct list_head *cleanup_list) } void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv, - flow_setup_cb_t *setup_cb) + void (*release)(void *cb_priv)) { struct flow_indr_dev *this, *next, *indr_dev = NULL; LIST_HEAD(cleanup_list); @@ -418,7 +419,7 @@ void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv, return; } - __flow_block_indr_cleanup(setup_cb, cb_priv, &cleanup_list); + __flow_block_indr_cleanup(release, cb_priv, &cleanup_list); mutex_unlock(&flow_indr_block_lock); flow_block_indr_notify(&cleanup_list); @@ -429,10 +430,12 @@ EXPORT_SYMBOL(flow_indr_dev_unregister); static void flow_block_indr_init(struct flow_block_cb *flow_block, struct flow_block_offload *bo, struct net_device *dev, void *data, + void *cb_priv, void (*cleanup)(struct flow_block_cb *block_cb)) { flow_block->indr.binder_type = bo->binder_type; flow_block->indr.data = data; + flow_block->indr.cb_priv = cb_priv; flow_block->indr.dev = dev; flow_block->indr.cleanup = cleanup; } @@ -442,6 +445,7 @@ struct flow_block_cb *flow_indr_block_cb_alloc(flow_setup_cb_t *cb, void (*release)(void *cb_priv), struct flow_block_offload *bo, struct net_device *dev, void *data, + void *indr_cb_priv, void (*cleanup)(struct flow_block_cb *block_cb)) { struct flow_block_cb *block_cb; @@ -450,7 +454,7 @@ struct flow_block_cb *flow_indr_block_cb_alloc(flow_setup_cb_t *cb, if (IS_ERR(block_cb)) goto out; - flow_block_indr_init(block_cb, bo, dev, data, cleanup); + flow_block_indr_init(block_cb, bo, dev, data, indr_cb_priv, cleanup); list_add(&block_cb->indr.list, &flow_block_indr_list); out: diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 62651e6683f6..5fff1e040168 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -950,6 +950,7 @@ static void nf_flow_table_indr_cleanup(struct flow_block_cb *block_cb) nf_flow_table_gc_cleanup(flowtable, dev); down_write(&flowtable->flow_block_lock); list_del(&block_cb->list); + list_del(&block_cb->driver_list); flow_block_cb_free(block_cb); up_write(&flowtable->flow_block_lock); } diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 185fc82c99aa..c7cf1cde46de 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -296,6 +296,7 @@ static void nft_indr_block_cleanup(struct flow_block_cb *block_cb) nft_flow_block_offload_init(&bo, dev_net(dev), FLOW_BLOCK_UNBIND, basechain, &extack); mutex_lock(&net->nft.commit_mutex); + list_del(&block_cb->driver_list); list_move(&block_cb->list, &bo.cb_list); nft_flow_offload_unbind(&bo, basechain); mutex_unlock(&net->nft.commit_mutex); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index a00a203b2ef5..f8028d73edf1 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -652,6 +652,7 @@ static void tc_block_indr_cleanup(struct flow_block_cb *block_cb) &block->flow_block, tcf_block_shared(block), &extack); down_write(&block->cb_lock); + list_del(&block_cb->driver_list); list_move(&block_cb->list, &bo.cb_list); up_write(&block->cb_lock); rtnl_lock(); -- cgit v1.2.3 From 3c005110d4083c52ff6c99018127adf92f3f23aa Mon Sep 17 00:00:00 2001 From: wenxu Date: Thu, 18 Jun 2020 20:49:11 +0800 Subject: net/sched: cls_api: fix nooffloaddevcnt warning dmesg log The block->nooffloaddevcnt should always count for indr block. even the indr block offload successful. The representor maybe gone away and the ingress qdisc can work in software mode. block->nooffloaddevcnt warning with following dmesg log: [ 760.667058] ##################################################### [ 760.668186] ## TEST test-ecmp-add-vxlan-encap-disable-sriov.sh ## [ 760.669179] ##################################################### [ 761.780655] :test: Fedora 30 (Thirty) [ 761.783794] :test: Linux reg-r-vrt-018-180 5.7.0+ [ 761.822890] :test: NIC ens1f0 FW 16.26.6000 PCI 0000:81:00.0 DEVICE 0x1019 ConnectX-5 Ex [ 761.860244] mlx5_core 0000:81:00.0 ens1f0: Link up [ 761.880693] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f0: link becomes ready [ 762.059732] mlx5_core 0000:81:00.1 ens1f1: Link up [ 762.234341] :test: unbind vfs of ens1f0 [ 762.257825] :test: Change ens1f0 eswitch (0000:81:00.0) mode to switchdev [ 762.291363] :test: unbind vfs of ens1f1 [ 762.306914] :test: Change ens1f1 eswitch (0000:81:00.1) mode to switchdev [ 762.309237] mlx5_core 0000:81:00.1: E-Switch: Disable: mode(LEGACY), nvfs(2), active vports(3) [ 763.282598] mlx5_core 0000:81:00.1: E-Switch: Supported tc offload range - chains: 4294967294, prios: 4294967295 [ 763.362825] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0) [ 763.444465] mlx5_core 0000:81:00.1 ens1f1: renamed from eth0 [ 763.460088] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0) [ 763.502586] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0) [ 763.552429] ens1f1_0: renamed from eth0 [ 763.569569] mlx5_core 0000:81:00.1: E-Switch: Enable: mode(OFFLOADS), nvfs(2), active vports(3) [ 763.629694] ens1f1_1: renamed from eth1 [ 764.631552] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f1_0: link becomes ready [ 764.670841] :test: unbind vfs of ens1f0 [ 764.681966] :test: unbind vfs of ens1f1 [ 764.726762] mlx5_core 0000:81:00.0 ens1f0: Link up [ 764.766511] mlx5_core 0000:81:00.1 ens1f1: Link up [ 764.797325] :test: Add multipath vxlan encap rule and disable sriov [ 764.798544] :test: config multipath route [ 764.812732] mlx5_core 0000:81:00.0: lag map port 1:2 port 2:2 [ 764.874556] mlx5_core 0000:81:00.0: modify lag map port 1:1 port 2:2 [ 765.603681] :test: OK [ 765.659048] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f1_1: link becomes ready [ 765.675085] :test: verify rule in hw [ 765.694237] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f0: link becomes ready [ 765.711892] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f1: link becomes ready [ 766.979230] :test: OK [ 768.125419] :test: OK [ 768.127519] :test: - disable sriov ens1f1 [ 768.131160] pci 0000:81:02.2: Removing from iommu group 75 [ 768.132646] pci 0000:81:02.3: Removing from iommu group 76 [ 769.179749] mlx5_core 0000:81:00.1: E-Switch: Disable: mode(OFFLOADS), nvfs(2), active vports(3) [ 769.455627] mlx5_core 0000:81:00.0: modify lag map port 1:1 port 2:1 [ 769.703990] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0) [ 769.988637] mlx5_core 0000:81:00.1 ens1f1: renamed from eth0 [ 769.990022] :test: - disable sriov ens1f0 [ 769.994922] pci 0000:81:00.2: Removing from iommu group 73 [ 769.997048] pci 0000:81:00.3: Removing from iommu group 74 [ 771.035813] mlx5_core 0000:81:00.0: E-Switch: Disable: mode(OFFLOADS), nvfs(2), active vports(3) [ 771.339091] ------------[ cut here ]------------ [ 771.340812] WARNING: CPU: 6 PID: 3448 at net/sched/cls_api.c:749 tcf_block_offload_unbind.isra.0+0x5c/0x60 [ 771.341728] Modules linked in: act_mirred act_tunnel_key cls_flower dummy vxlan ip6_udp_tunnel udp_tunnel sch_ingress nfsv3 nfs_acl nfs lockd grace fscache tun bridge stp llc sunrpc rdma_ucm rdma_cm iw_cm ib_cm mlx5_ib ib_uverbs ib_core mlx5_core intel_rapl_msr intel_rapl_common sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp mlxfw act_ct nf_flow_table kvm_intel nf_nat kvm nf_conntrack irqbypass crct10dif_pclmul igb crc32_pclmul nf_defrag_ipv6 libcrc32c nf_defrag_ipv4 crc32c_intel ghash_clmulni_intel ptp ipmi_ssif intel_cstate pps_c ore ses intel_uncore mei_me iTCO_wdt joydev ipmi_si iTCO_vendor_support i2c_i801 enclosure mei ioatdma dca lpc_ich wmi ipmi_devintf pcspkr acpi_power_meter ipmi_msghandler acpi_pad ast i2c_algo_bit drm_vram_helper drm_kms_helper drm_ttm_helper ttm drm mpt3sas raid_class scsi_transport_sas [ 771.347818] CPU: 6 PID: 3448 Comm: test-ecmp-add-v Not tainted 5.7.0+ #1146 [ 771.348727] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017 [ 771.349646] RIP: 0010:tcf_block_offload_unbind.isra.0+0x5c/0x60 [ 771.350553] Code: 4a fd ff ff 83 f8 a1 74 0e 5b 4c 89 e7 5d 41 5c 41 5d e9 07 93 89 ff 8b 83 a0 00 00 00 8d 50 ff 89 93 a0 00 00 00 85 c0 75 df <0f> 0b eb db 0f 1f 44 00 00 41 57 41 56 41 55 41 89 cd 41 54 49 89 [ 771.352420] RSP: 0018:ffffb33144cd3b00 EFLAGS: 00010246 [ 771.353353] RAX: 0000000000000000 RBX: ffff8b37cf4b2800 RCX: 0000000000000000 [ 771.354294] RDX: 00000000ffffffff RSI: ffff8b3b9aad0000 RDI: ffffffff8d5c6e20 [ 771.355245] RBP: ffff8b37eb546948 R08: ffffffffc0b7a348 R09: ffff8b3b9aad0000 [ 771.356189] R10: 0000000000000001 R11: ffff8b3ba7a0a1c0 R12: ffff8b37cf4b2850 [ 771.357123] R13: ffff8b3b9aad0000 R14: ffff8b37cf4b2820 R15: ffff8b37cf4b2820 [ 771.358039] FS: 00007f8a19b6e740(0000) GS:ffff8b3befa00000(0000) knlGS:0000000000000000 [ 771.358965] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 771.359885] CR2: 00007f3afb91c1a0 CR3: 000000045133c004 CR4: 00000000001606e0 [ 771.360825] Call Trace: [ 771.361764] __tcf_block_put+0x84/0x150 [ 771.362712] ingress_destroy+0x1b/0x20 [sch_ingress] [ 771.363658] qdisc_destroy+0x3e/0xc0 [ 771.364594] dev_shutdown+0x7a/0xa5 [ 771.365522] rollback_registered_many+0x20d/0x530 [ 771.366458] ? netdev_upper_dev_unlink+0x15d/0x1c0 [ 771.367387] unregister_netdevice_many.part.0+0xf/0x70 [ 771.368310] vxlan_netdevice_event+0xa4/0x110 [vxlan] [ 771.369454] notifier_call_chain+0x4c/0x70 [ 771.370579] rollback_registered_many+0x2f5/0x530 [ 771.371719] rollback_registered+0x56/0x90 [ 771.372843] unregister_netdevice_queue+0x73/0xb0 [ 771.373982] unregister_netdev+0x18/0x20 [ 771.375168] mlx5e_vport_rep_unload+0x56/0xc0 [mlx5_core] [ 771.376327] esw_offloads_disable+0x81/0x90 [mlx5_core] [ 771.377512] mlx5_eswitch_disable_locked.cold+0xcb/0x1af [mlx5_core] [ 771.378679] mlx5_eswitch_disable+0x44/0x60 [mlx5_core] [ 771.379822] mlx5_device_disable_sriov+0xad/0xb0 [mlx5_core] [ 771.380968] mlx5_core_sriov_configure+0xc1/0xe0 [mlx5_core] [ 771.382087] sriov_numvfs_store+0xfc/0x130 [ 771.383195] kernfs_fop_write+0xce/0x1b0 [ 771.384302] vfs_write+0xb6/0x1a0 [ 771.385410] ksys_write+0x5f/0xe0 [ 771.386500] do_syscall_64+0x5b/0x1d0 [ 771.387569] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 0fdcf78d5973 ("net: use flow_indr_dev_setup_offload()") Signed-off-by: wenxu Signed-off-by: David S. Miller --- net/sched/cls_api.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index f8028d73edf1..faa78b7dd962 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -672,25 +672,29 @@ static int tcf_block_offload_cmd(struct tcf_block *block, struct netlink_ext_ack *extack) { struct flow_block_offload bo = {}; - int err; tcf_block_offload_init(&bo, dev, command, ei->binder_type, &block->flow_block, tcf_block_shared(block), extack); - if (dev->netdev_ops->ndo_setup_tc) + if (dev->netdev_ops->ndo_setup_tc) { + int err; + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); - else - err = flow_indr_dev_setup_offload(dev, TC_SETUP_BLOCK, block, - &bo, tc_block_indr_cleanup); + if (err < 0) { + if (err != -EOPNOTSUPP) + NL_SET_ERR_MSG(extack, "Driver ndo_setup_tc failed"); + return err; + } - if (err < 0) { - if (err != -EOPNOTSUPP) - NL_SET_ERR_MSG(extack, "Driver ndo_setup_tc failed"); - return err; + return tcf_block_setup(block, &bo); } - return tcf_block_setup(block, &bo); + flow_indr_dev_setup_offload(dev, TC_SETUP_BLOCK, block, &bo, + tc_block_indr_cleanup); + tcf_block_setup(block, &bo); + + return -EOPNOTSUPP; } static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, -- cgit v1.2.3 From 5948378b26d89f8aa5eac37629dbd0616ce8d7a7 Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Thu, 18 Jun 2020 10:43:46 -0500 Subject: ibmveth: Fix max MTU limit The max MTU limit defined for ibmveth is not accounting for virtual ethernet buffer overhead, which is twenty-two additional bytes set aside for the ethernet header and eight additional bytes of an opaque handle reserved for use by the hypervisor. Update the max MTU to reflect this overhead. Fixes: d894be57ca92 ("ethernet: use net core MTU range checking in more drivers") Fixes: 110447f8269a ("ethernet: fix min/max MTU typos") Signed-off-by: Thomas Falcon Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmveth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c index 96d36ae5049e..c5c732601e35 100644 --- a/drivers/net/ethernet/ibm/ibmveth.c +++ b/drivers/net/ethernet/ibm/ibmveth.c @@ -1715,7 +1715,7 @@ static int ibmveth_probe(struct vio_dev *dev, const struct vio_device_id *id) } netdev->min_mtu = IBMVETH_MIN_MTU; - netdev->max_mtu = ETH_MAX_MTU; + netdev->max_mtu = ETH_MAX_MTU - IBMVETH_BUFF_OH; memcpy(netdev->dev_addr, mac_addr_p, ETH_ALEN); -- cgit v1.2.3 From ca8826095e4d4afc0ccaead27bba6e4b623a12ae Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 18 Jun 2020 12:40:43 -0400 Subject: selftests/net: report etf errors correctly The ETF qdisc can queue skbs that it could not pace on the errqueue. Address a few issues in the selftest - recv buffer size was too small, and incorrectly calculated - compared errno to ee_code instead of ee_errno - missed invalid request error type v2: - fix a few checkpatch --strict indentation warnings Fixes: ea6a547669b3 ("selftests/net: make so_txtime more robust to timer variance") Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- tools/testing/selftests/net/so_txtime.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/net/so_txtime.c b/tools/testing/selftests/net/so_txtime.c index 383bac05ac32..ceaad78e9667 100644 --- a/tools/testing/selftests/net/so_txtime.c +++ b/tools/testing/selftests/net/so_txtime.c @@ -15,8 +15,9 @@ #include #include #include +#include #include -#include +#include #include #include #include @@ -140,8 +141,8 @@ static void do_recv_errqueue_timeout(int fdt) { char control[CMSG_SPACE(sizeof(struct sock_extended_err)) + CMSG_SPACE(sizeof(struct sockaddr_in6))] = {0}; - char data[sizeof(struct ipv6hdr) + - sizeof(struct tcphdr) + 1]; + char data[sizeof(struct ethhdr) + sizeof(struct ipv6hdr) + + sizeof(struct udphdr) + 1]; struct sock_extended_err *err; struct msghdr msg = {0}; struct iovec iov = {0}; @@ -159,6 +160,8 @@ static void do_recv_errqueue_timeout(int fdt) msg.msg_controllen = sizeof(control); while (1) { + const char *reason; + ret = recvmsg(fdt, &msg, MSG_ERRQUEUE); if (ret == -1 && errno == EAGAIN) break; @@ -176,14 +179,30 @@ static void do_recv_errqueue_timeout(int fdt) err = (struct sock_extended_err *)CMSG_DATA(cm); if (err->ee_origin != SO_EE_ORIGIN_TXTIME) error(1, 0, "errqueue: origin 0x%x\n", err->ee_origin); - if (err->ee_code != ECANCELED) - error(1, 0, "errqueue: code 0x%x\n", err->ee_code); + + switch (err->ee_errno) { + case ECANCELED: + if (err->ee_code != SO_EE_CODE_TXTIME_MISSED) + error(1, 0, "errqueue: unknown ECANCELED %u\n", + err->ee_code); + reason = "missed txtime"; + break; + case EINVAL: + if (err->ee_code != SO_EE_CODE_TXTIME_INVALID_PARAM) + error(1, 0, "errqueue: unknown EINVAL %u\n", + err->ee_code); + reason = "invalid txtime"; + break; + default: + error(1, 0, "errqueue: errno %u code %u\n", + err->ee_errno, err->ee_code); + }; tstamp = ((int64_t) err->ee_data) << 32 | err->ee_info; tstamp -= (int64_t) glob_tstart; tstamp /= 1000 * 1000; - fprintf(stderr, "send: pkt %c at %" PRId64 "ms dropped\n", - data[ret - 1], tstamp); + fprintf(stderr, "send: pkt %c at %" PRId64 "ms dropped: %s\n", + data[ret - 1], tstamp, reason); msg.msg_flags = 0; msg.msg_controllen = sizeof(control); -- cgit v1.2.3 From b59eabd23ee53e8c3492602722e1697f9a2f63ad Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 18 Jun 2020 10:29:04 -0700 Subject: ionic: tame the watchdog timer on reconfig Even with moving netif_tx_disable() to an earlier point when taking down the queues for a reconfiguration, we still end up with the occasional netdev watchdog Tx Timeout complaint. The old method of using netif_trans_update() works fine for queue 0, but has no effect on the remaining queues. Using netif_device_detach() allows us to signal to the watchdog to ignore us for the moment. Fixes: beead698b173 ("ionic: Add the basic NDO callbacks for netdev support") Signed-off-by: Shannon Nelson Acked-by: Jonathan Toppins Signed-off-by: David S. Miller --- drivers/net/ethernet/pensando/ionic/ionic_lif.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index 8f29ef133743..aaa00edd9d5b 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -1694,15 +1694,15 @@ static void ionic_stop_queues(struct ionic_lif *lif) if (!test_and_clear_bit(IONIC_LIF_F_UP, lif->state)) return; - ionic_txrx_disable(lif); netif_tx_disable(lif->netdev); + ionic_txrx_disable(lif); } int ionic_stop(struct net_device *netdev) { struct ionic_lif *lif = netdev_priv(netdev); - if (!netif_device_present(netdev)) + if (test_bit(IONIC_LIF_F_FW_RESET, lif->state)) return 0; ionic_stop_queues(lif); @@ -1985,18 +1985,19 @@ int ionic_reset_queues(struct ionic_lif *lif) bool running; int err = 0; - /* Put off the next watchdog timeout */ - netif_trans_update(lif->netdev); - err = ionic_wait_for_bit(lif, IONIC_LIF_F_QUEUE_RESET); if (err) return err; running = netif_running(lif->netdev); - if (running) + if (running) { + netif_device_detach(lif->netdev); err = ionic_stop(lif->netdev); - if (!err && running) + } + if (!err && running) { ionic_open(lif->netdev); + netif_device_attach(lif->netdev); + } clear_bit(IONIC_LIF_F_QUEUE_RESET, lif->state); -- cgit v1.2.3 From 8b40eb73509f5704a0e8cd25de0163876299f1a7 Mon Sep 17 00:00:00 2001 From: Dany Madden Date: Thu, 18 Jun 2020 15:24:13 -0400 Subject: ibmvnic: continue to init in CRQ reset returns H_CLOSED Continue the reset path when partner adapter is not ready or H_CLOSED is returned from reset crq. This patch allows the CRQ init to proceed to establish a valid CRQ for traffic to flow after reset. Signed-off-by: Dany Madden Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 2baf7b3ff4cb..0fd7eae25fe9 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1971,13 +1971,18 @@ static int do_reset(struct ibmvnic_adapter *adapter, release_sub_crqs(adapter, 1); } else { rc = ibmvnic_reset_crq(adapter); - if (!rc) + if (rc == H_CLOSED || rc == H_SUCCESS) { rc = vio_enable_interrupts(adapter->vdev); + if (rc) + netdev_err(adapter->netdev, + "Reset failed to enable interrupts. rc=%d\n", + rc); + } } if (rc) { netdev_err(adapter->netdev, - "Couldn't initialize crq. rc=%d\n", rc); + "Reset couldn't initialize crq. rc=%d\n", rc); goto out; } -- cgit v1.2.3 From 89fbd26cca7ec9e82ec4787a4b6e95939b57d073 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 18 Jun 2020 23:25:50 +0200 Subject: r8169: fix firmware not resetting tp->ocp_base Typically the firmware takes care that tp->ocp_base is reset to its default value. That's not the case (at least) for RTL8117. As a result subsequent PHY access reads/writes the wrong page and the link is broken. Fix this be resetting tp->ocp_base explicitly. Fixes: 229c1e0dfd3d ("r8169: load firmware for RTL8168fp/RTL8117") Reported-by: Aaron Ma Tested-by: Aaron Ma Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index dad84ecf5a77..b660ddbe4025 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -2114,8 +2114,11 @@ static void rtl_release_firmware(struct rtl8169_private *tp) void r8169_apply_firmware(struct rtl8169_private *tp) { /* TODO: release firmware if rtl_fw_write_firmware signals failure. */ - if (tp->rtl_fw) + if (tp->rtl_fw) { rtl_fw_write_firmware(tp, tp->rtl_fw); + /* At least one firmware doesn't reset tp->ocp_base. */ + tp->ocp_base = OCP_STD_PHY_BASE; + } } static void rtl8168_config_eee_mac(struct rtl8169_private *tp) -- cgit v1.2.3 From 54eeea0d707d4e6efb02f5a7896d6788472a2da4 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 19 Jun 2020 11:24:45 +0800 Subject: tc-testing: update geneve options match in tunnel_key unit tests Since iproute2 commit f72c3ad00f3b ("tc: m_tunnel_key: add options support for vxlan"), the geneve opt output use key word "geneve_opts" instead of "geneve_opt". To make compatibility for both old and new iproute2, let's accept both "geneve_opt" and "geneve_opts". Suggested-by: Davide Caratti Signed-off-by: Hangbin Liu Reviewed-by: Simon Horman Tested-by: Davide Caratti Signed-off-by: David S. Miller --- .../selftests/tc-testing/tc-tests/actions/tunnel_key.json | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json b/tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json index fbeb9197697d..7357c58fa2dc 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json @@ -629,7 +629,7 @@ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 1.1.1.1 dst_ip 2.2.2.2 id 42 dst_port 6081 geneve_opts 0102:80:00880022 index 1", "expExitCode": "0", "verifyCmd": "$TC actions get action tunnel_key index 1", - "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 42.*dst_port 6081.*geneve_opt 0102:80:00880022.*index 1", + "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 42.*dst_port 6081.*geneve_opt[s]? 0102:80:00880022.*index 1", "matchCount": "1", "teardown": [ "$TC actions flush action tunnel_key" @@ -653,7 +653,7 @@ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 1.1.1.1 dst_ip 2.2.2.2 id 42 dst_port 6081 geneve_opts 0102:80:00880022,0408:42:0040007611223344,0111:02:1020304011223344 index 1", "expExitCode": "0", "verifyCmd": "$TC actions get action tunnel_key index 1", - "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 42.*dst_port 6081.*geneve_opt 0102:80:00880022,0408:42:0040007611223344,0111:02:1020304011223344.*index 1", + "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 42.*dst_port 6081.*geneve_opt[s]? 0102:80:00880022,0408:42:0040007611223344,0111:02:1020304011223344.*index 1", "matchCount": "1", "teardown": [ "$TC actions flush action tunnel_key" @@ -677,7 +677,7 @@ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 1.1.1.1 dst_ip 2.2.2.2 id 42 dst_port 6081 geneve_opts 824212:80:00880022 index 1", "expExitCode": "255", "verifyCmd": "$TC actions get action tunnel_key index 1", - "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 42.*dst_port 6081.*geneve_opt 824212:80:00880022.*index 1", + "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 42.*dst_port 6081.*geneve_opt[s]? 824212:80:00880022.*index 1", "matchCount": "0", "teardown": [ "$TC actions flush action tunnel_key" @@ -701,7 +701,7 @@ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 1.1.1.1 dst_ip 2.2.2.2 id 42 dst_port 6081 geneve_opts 0102:4224:00880022 index 1", "expExitCode": "255", "verifyCmd": "$TC actions get action tunnel_key index 1", - "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 42.*dst_port 6081.*geneve_opt 0102:4224:00880022.*index 1", + "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 42.*dst_port 6081.*geneve_opt[s]? 0102:4224:00880022.*index 1", "matchCount": "0", "teardown": [ "$TC actions flush action tunnel_key" @@ -725,7 +725,7 @@ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 1.1.1.1 dst_ip 2.2.2.2 id 42 dst_port 6081 geneve_opts 0102:80:4288 index 1", "expExitCode": "255", "verifyCmd": "$TC actions get action tunnel_key index 1", - "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 42.*dst_port 6081.*geneve_opt 0102:80:4288.*index 1", + "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 42.*dst_port 6081.*geneve_opt[s]? 0102:80:4288.*index 1", "matchCount": "0", "teardown": [ "$TC actions flush action tunnel_key" @@ -749,7 +749,7 @@ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 1.1.1.1 dst_ip 2.2.2.2 id 42 dst_port 6081 geneve_opts 0102:80:4288428822 index 1", "expExitCode": "255", "verifyCmd": "$TC actions get action tunnel_key index 1", - "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 42.*dst_port 6081.*geneve_opt 0102:80:4288428822.*index 1", + "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 42.*dst_port 6081.*geneve_opt[s]? 0102:80:4288428822.*index 1", "matchCount": "0", "teardown": [ "$TC actions flush action tunnel_key" @@ -773,7 +773,7 @@ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 1.1.1.1 dst_ip 2.2.2.2 id 42 dst_port 6081 geneve_opts 0102:80:00880022,0408:42: index 1", "expExitCode": "255", "verifyCmd": "$TC actions get action tunnel_key index 1", - "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 42.*dst_port 6081.*geneve_opt 0102:80:00880022,0408:42:.*index 1", + "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 42.*dst_port 6081.*geneve_opt[s]? 0102:80:00880022,0408:42:.*index 1", "matchCount": "0", "teardown": [ "$TC actions flush action tunnel_key" -- cgit v1.2.3 From 0041cd5a50442db6e456b145892a0eaf2dff061f Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 19 Jun 2020 23:38:16 +0100 Subject: rxrpc: Fix notification call on completion of discarded calls When preallocated service calls are being discarded, they're passed to ->discard_new_call() to have the caller clean up any attached higher-layer preallocated pieces before being marked completed. However, the act of marking them completed now invokes the call's notification function - which causes a problem because that function might assume that the previously freed pieces of memory are still there. Fix this by setting a dummy notification function on the socket after calling ->discard_new_call(). This results in the following kasan message when the kafs module is removed. ================================================================== BUG: KASAN: use-after-free in afs_wake_up_async_call+0x6aa/0x770 fs/afs/rxrpc.c:707 Write of size 1 at addr ffff8880946c39e4 by task kworker/u4:1/21 CPU: 0 PID: 21 Comm: kworker/u4:1 Not tainted 5.8.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: netns cleanup_net Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x18f/0x20d lib/dump_stack.c:118 print_address_description.constprop.0.cold+0xd3/0x413 mm/kasan/report.c:383 __kasan_report mm/kasan/report.c:513 [inline] kasan_report.cold+0x1f/0x37 mm/kasan/report.c:530 afs_wake_up_async_call+0x6aa/0x770 fs/afs/rxrpc.c:707 rxrpc_notify_socket+0x1db/0x5d0 net/rxrpc/recvmsg.c:40 __rxrpc_set_call_completion.part.0+0x172/0x410 net/rxrpc/recvmsg.c:76 __rxrpc_call_completed net/rxrpc/recvmsg.c:112 [inline] rxrpc_call_completed+0xca/0xf0 net/rxrpc/recvmsg.c:111 rxrpc_discard_prealloc+0x781/0xab0 net/rxrpc/call_accept.c:233 rxrpc_listen+0x147/0x360 net/rxrpc/af_rxrpc.c:245 afs_close_socket+0x95/0x320 fs/afs/rxrpc.c:110 afs_net_exit+0x1bc/0x310 fs/afs/main.c:155 ops_exit_list.isra.0+0xa8/0x150 net/core/net_namespace.c:186 cleanup_net+0x511/0xa50 net/core/net_namespace.c:603 process_one_work+0x965/0x1690 kernel/workqueue.c:2269 worker_thread+0x96/0xe10 kernel/workqueue.c:2415 kthread+0x3b5/0x4a0 kernel/kthread.c:291 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:293 Allocated by task 6820: save_stack+0x1b/0x40 mm/kasan/common.c:48 set_track mm/kasan/common.c:56 [inline] __kasan_kmalloc mm/kasan/common.c:494 [inline] __kasan_kmalloc.constprop.0+0xbf/0xd0 mm/kasan/common.c:467 kmem_cache_alloc_trace+0x153/0x7d0 mm/slab.c:3551 kmalloc include/linux/slab.h:555 [inline] kzalloc include/linux/slab.h:669 [inline] afs_alloc_call+0x55/0x630 fs/afs/rxrpc.c:141 afs_charge_preallocation+0xe9/0x2d0 fs/afs/rxrpc.c:757 afs_open_socket+0x292/0x360 fs/afs/rxrpc.c:92 afs_net_init+0xa6c/0xe30 fs/afs/main.c:125 ops_init+0xaf/0x420 net/core/net_namespace.c:151 setup_net+0x2de/0x860 net/core/net_namespace.c:341 copy_net_ns+0x293/0x590 net/core/net_namespace.c:482 create_new_namespaces+0x3fb/0xb30 kernel/nsproxy.c:110 unshare_nsproxy_namespaces+0xbd/0x1f0 kernel/nsproxy.c:231 ksys_unshare+0x43d/0x8e0 kernel/fork.c:2983 __do_sys_unshare kernel/fork.c:3051 [inline] __se_sys_unshare kernel/fork.c:3049 [inline] __x64_sys_unshare+0x2d/0x40 kernel/fork.c:3049 do_syscall_64+0x60/0xe0 arch/x86/entry/common.c:359 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 21: save_stack+0x1b/0x40 mm/kasan/common.c:48 set_track mm/kasan/common.c:56 [inline] kasan_set_free_info mm/kasan/common.c:316 [inline] __kasan_slab_free+0xf7/0x140 mm/kasan/common.c:455 __cache_free mm/slab.c:3426 [inline] kfree+0x109/0x2b0 mm/slab.c:3757 afs_put_call+0x585/0xa40 fs/afs/rxrpc.c:190 rxrpc_discard_prealloc+0x764/0xab0 net/rxrpc/call_accept.c:230 rxrpc_listen+0x147/0x360 net/rxrpc/af_rxrpc.c:245 afs_close_socket+0x95/0x320 fs/afs/rxrpc.c:110 afs_net_exit+0x1bc/0x310 fs/afs/main.c:155 ops_exit_list.isra.0+0xa8/0x150 net/core/net_namespace.c:186 cleanup_net+0x511/0xa50 net/core/net_namespace.c:603 process_one_work+0x965/0x1690 kernel/workqueue.c:2269 worker_thread+0x96/0xe10 kernel/workqueue.c:2415 kthread+0x3b5/0x4a0 kernel/kthread.c:291 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:293 The buggy address belongs to the object at ffff8880946c3800 which belongs to the cache kmalloc-1k of size 1024 The buggy address is located 484 bytes inside of 1024-byte region [ffff8880946c3800, ffff8880946c3c00) The buggy address belongs to the page: page:ffffea000251b0c0 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 flags: 0xfffe0000000200(slab) raw: 00fffe0000000200 ffffea0002546508 ffffea00024fa248 ffff8880aa000c40 raw: 0000000000000000 ffff8880946c3000 0000000100000002 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8880946c3880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8880946c3900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff8880946c3980: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff8880946c3a00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8880946c3a80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== Reported-by: syzbot+d3eccef36ddbd02713e9@syzkaller.appspotmail.com Fixes: 5ac0d62226a0 ("rxrpc: Fix missing notification") Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/call_accept.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index b7611cc159e5..032ed76c0166 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -22,6 +22,11 @@ #include #include "ar-internal.h" +static void rxrpc_dummy_notify(struct sock *sk, struct rxrpc_call *call, + unsigned long user_call_ID) +{ +} + /* * Preallocate a single service call, connection and peer and, if possible, * give them a user ID and attach the user's side of the ID to them. @@ -228,6 +233,8 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) if (rx->discard_new_call) { _debug("discard %lx", call->user_call_ID); rx->discard_new_call(call, call->user_call_ID); + if (call->notify_rx) + call->notify_rx = rxrpc_dummy_notify; rxrpc_put_call(call, rxrpc_call_put_kernel); } rxrpc_call_completed(call); -- cgit v1.2.3 From 67c20de35a3cc2e2cd940f95ebd85ed0a765315a Mon Sep 17 00:00:00 2001 From: Rob Gill Date: Sat, 20 Jun 2020 02:08:25 +0000 Subject: net: Add MODULE_DESCRIPTION entries to network modules The user tool modinfo is used to get information on kernel modules, including a description where it is available. This patch adds a brief MODULE_DESCRIPTION to the following modules: 9p drop_monitor esp4_offload esp6_offload fou fou6 ila sch_fq sch_fq_codel sch_hhf Signed-off-by: Rob Gill Signed-off-by: David S. Miller --- net/9p/mod.c | 1 + net/core/drop_monitor.c | 1 + net/ipv4/esp4_offload.c | 1 + net/ipv4/fou.c | 1 + net/ipv6/esp6_offload.c | 1 + net/ipv6/fou6.c | 1 + net/ipv6/ila/ila_main.c | 1 + net/sched/sch_fq.c | 1 + net/sched/sch_fq_codel.c | 1 + net/sched/sch_hhf.c | 1 + 10 files changed, 10 insertions(+) diff --git a/net/9p/mod.c b/net/9p/mod.c index c1b62428da7b..5126566850bd 100644 --- a/net/9p/mod.c +++ b/net/9p/mod.c @@ -189,3 +189,4 @@ MODULE_AUTHOR("Latchesar Ionkov "); MODULE_AUTHOR("Eric Van Hensbergen "); MODULE_AUTHOR("Ron Minnich "); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Plan 9 Resource Sharing Support (9P2000)"); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 2ee7bc4c9e03..b09bebeadf0b 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -1721,3 +1721,4 @@ module_exit(exit_net_drop_monitor); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Neil Horman "); MODULE_ALIAS_GENL_FAMILY("NET_DM"); +MODULE_DESCRIPTION("Monitoring code for network dropped packet alerts"); diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index d14133eac476..5bda5aeda579 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -361,3 +361,4 @@ module_exit(esp4_offload_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steffen Klassert "); MODULE_ALIAS_XFRM_OFFLOAD_TYPE(AF_INET, XFRM_PROTO_ESP); +MODULE_DESCRIPTION("IPV4 GSO/GRO offload support"); diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index dcc79ff54b41..abd083415f89 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -1304,3 +1304,4 @@ module_init(fou_init); module_exit(fou_fini); MODULE_AUTHOR("Tom Herbert "); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Foo over UDP"); diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 55addea1948f..1ca516fb30e1 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -395,3 +395,4 @@ module_exit(esp6_offload_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steffen Klassert "); MODULE_ALIAS_XFRM_OFFLOAD_TYPE(AF_INET6, XFRM_PROTO_ESP); +MODULE_DESCRIPTION("IPV6 GSO/GRO offload support"); diff --git a/net/ipv6/fou6.c b/net/ipv6/fou6.c index 091f94184dc1..430518ae26fa 100644 --- a/net/ipv6/fou6.c +++ b/net/ipv6/fou6.c @@ -224,3 +224,4 @@ module_init(fou6_init); module_exit(fou6_fini); MODULE_AUTHOR("Tom Herbert "); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Foo over UDP (IPv6)"); diff --git a/net/ipv6/ila/ila_main.c b/net/ipv6/ila/ila_main.c index 257d2b681246..36c58aa257e8 100644 --- a/net/ipv6/ila/ila_main.c +++ b/net/ipv6/ila/ila_main.c @@ -120,3 +120,4 @@ module_init(ila_init); module_exit(ila_fini); MODULE_AUTHOR("Tom Herbert "); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IPv6: Identifier Locator Addressing (ILA)"); diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 8f06a808c59a..2fb76fc0cc31 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -1075,3 +1075,4 @@ module_init(fq_module_init) module_exit(fq_module_exit) MODULE_AUTHOR("Eric Dumazet"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Fair Queue Packet Scheduler"); diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 436160be9c18..459a784056c0 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -721,3 +721,4 @@ module_init(fq_codel_module_init) module_exit(fq_codel_module_exit) MODULE_AUTHOR("Eric Dumazet"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Fair Queue CoDel discipline"); diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index be35f03b657b..420ede875322 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -721,3 +721,4 @@ module_exit(hhf_module_exit) MODULE_AUTHOR("Terry Lam"); MODULE_AUTHOR("Nandita Dukkipati"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Heavy-Hitter Filter (HHF)"); -- cgit v1.2.3 From b0c34bde72a59c05e826bf0a5aeca0d73f38f791 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sun, 21 Jun 2020 17:06:30 +0200 Subject: MAINTAINERS: update email address for Felix Fietkau The old address has been bouncing for a while now Signed-off-by: Felix Fietkau Signed-off-by: David S. Miller --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index a09e61268c08..352904042a0d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10808,7 +10808,7 @@ F: Documentation/devicetree/bindings/dma/mtk-* F: drivers/dma/mediatek/ MEDIATEK ETHERNET DRIVER -M: Felix Fietkau +M: Felix Fietkau M: John Crispin M: Sean Wang M: Mark Lee -- cgit v1.2.3 From 6d61f483f148b856d47a6c96d5d84054d5a9f849 Mon Sep 17 00:00:00 2001 From: Dejin Zheng Date: Sat, 20 Jun 2020 22:55:34 +0800 Subject: net: phy: smsc: fix printing too many logs Commit 7ae7ad2f11ef47 ("net: phy: smsc: use phy_read_poll_timeout() to simplify the code") will print a lot of logs as follows when Ethernet cable is not connected: [ 4.473105] SMSC LAN8710/LAN8720 2188000.ethernet-1:00: lan87xx_read_status failed: -110 When wait 640 ms for check ENERGYON bit, the timeout should not be regarded as an actual error and an error message also should not be printed. due to a hardware bug in LAN87XX device, it leads to unstable detection of plugging in Ethernet cable when LAN87xx is in Energy Detect Power-Down mode. the workaround for it involves, when the link is down, and at each read_status() call: - disable EDPD mode, forcing the PHY out of low-power mode - waiting 640ms to see if we have any energy detected from the media - re-enable entry to EDPD mode This is presumably enough to allow the PHY to notice that a cable is connected, and resume normal operations to negotiate with the partner. The problem is that when no media is detected, the 640ms wait times out and this commit was modified to prints an error message. it is an inappropriate conversion by used phy_read_poll_timeout() to introduce this bug. so fix this issue by use read_poll_timeout() to replace phy_read_poll_timeout(). Fixes: 7ae7ad2f11ef47 ("net: phy: smsc: use phy_read_poll_timeout() to simplify the code") Reported-by: Kevin Groeneveld Signed-off-by: Dejin Zheng Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/smsc.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/phy/smsc.c b/drivers/net/phy/smsc.c index 93da7d3d0954..74568ae16125 100644 --- a/drivers/net/phy/smsc.c +++ b/drivers/net/phy/smsc.c @@ -122,10 +122,13 @@ static int lan87xx_read_status(struct phy_device *phydev) if (rc < 0) return rc; - /* Wait max 640 ms to detect energy */ - phy_read_poll_timeout(phydev, MII_LAN83C185_CTRL_STATUS, rc, - rc & MII_LAN83C185_ENERGYON, 10000, - 640000, true); + /* Wait max 640 ms to detect energy and the timeout is not + * an actual error. + */ + read_poll_timeout(phy_read, rc, + rc & MII_LAN83C185_ENERGYON || rc < 0, + 10000, 640000, true, phydev, + MII_LAN83C185_CTRL_STATUS); if (rc < 0) return rc; -- cgit v1.2.3 From f3fe412b0a634286a6a3753c3f9ff201e6bec716 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 21 Jun 2020 11:29:17 +0300 Subject: mlxsw: spectrum: Do not rely on machine endianness The second commit cited below performed a cast of 'u32 buffsize' to '(u16 *)' when calling mlxsw_sp_port_headroom_8x_adjust(): mlxsw_sp_port_headroom_8x_adjust(mlxsw_sp_port, (u16 *) &buffsize); Colin noted that this will behave differently on big endian architectures compared to little endian architectures. Fix this by following Colin's suggestion and have the function accept and return 'u32' instead of passing the current size by reference. Fixes: da382875c616 ("mlxsw: spectrum: Extend to support Spectrum-3 ASIC") Fixes: 60833d54d56c ("mlxsw: spectrum: Adjust headroom buffers for 8x ports") Signed-off-by: Ido Schimmel Reported-by: Colin Ian King Suggested-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 4 ++-- drivers/net/ethernet/mellanox/mlxsw/spectrum.h | 8 +++----- drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c | 2 +- drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c | 2 +- 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 55af877763ed..029ea344ad65 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -978,10 +978,10 @@ int __mlxsw_sp_port_headroom_set(struct mlxsw_sp_port *mlxsw_sp_port, int mtu, lossy = !(pfc || pause_en); thres_cells = mlxsw_sp_pg_buf_threshold_get(mlxsw_sp, mtu); - mlxsw_sp_port_headroom_8x_adjust(mlxsw_sp_port, &thres_cells); + thres_cells = mlxsw_sp_port_headroom_8x_adjust(mlxsw_sp_port, thres_cells); delay_cells = mlxsw_sp_pg_buf_delay_get(mlxsw_sp, mtu, delay, pfc, pause_en); - mlxsw_sp_port_headroom_8x_adjust(mlxsw_sp_port, &delay_cells); + delay_cells = mlxsw_sp_port_headroom_8x_adjust(mlxsw_sp_port, delay_cells); total_cells = thres_cells + delay_cells; taken_headroom_cells += total_cells; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index 6e87457dd635..3abe3e7d89bc 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -374,17 +374,15 @@ mlxsw_sp_port_vlan_find_by_vid(const struct mlxsw_sp_port *mlxsw_sp_port, return NULL; } -static inline void +static inline u32 mlxsw_sp_port_headroom_8x_adjust(const struct mlxsw_sp_port *mlxsw_sp_port, - u16 *p_size) + u32 size_cells) { /* Ports with eight lanes use two headroom buffers between which the * configured headroom size is split. Therefore, multiply the calculated * headroom size by two. */ - if (mlxsw_sp_port->mapping.width != 8) - return; - *p_size *= 2; + return mlxsw_sp_port->mapping.width == 8 ? 2 * size_cells : size_cells; } enum mlxsw_sp_flood_type { diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c index f25a8b084b4b..6f84557a5a6f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c @@ -312,7 +312,7 @@ static int mlxsw_sp_port_pb_init(struct mlxsw_sp_port *mlxsw_sp_port) if (i == MLXSW_SP_PB_UNUSED) continue; - mlxsw_sp_port_headroom_8x_adjust(mlxsw_sp_port, &size); + size = mlxsw_sp_port_headroom_8x_adjust(mlxsw_sp_port, size); mlxsw_reg_pbmc_lossy_buffer_pack(pbmc_pl, i, size); } mlxsw_reg_pbmc_lossy_buffer_pack(pbmc_pl, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c index f843545d3478..92351a79addc 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c @@ -782,7 +782,7 @@ mlxsw_sp_span_port_buffer_update(struct mlxsw_sp_port *mlxsw_sp_port, u16 mtu) speed = 0; buffsize = mlxsw_sp_span_buffsize_get(mlxsw_sp, speed, mtu); - mlxsw_sp_port_headroom_8x_adjust(mlxsw_sp_port, (u16 *) &buffsize); + buffsize = mlxsw_sp_port_headroom_8x_adjust(mlxsw_sp_port, buffsize); mlxsw_reg_sbib_pack(sbib_pl, mlxsw_sp_port->local_port, buffsize); return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbib), sbib_pl); } -- cgit v1.2.3 From b835a71ef64a61383c414d6bf2896d2c0161deca Mon Sep 17 00:00:00 2001 From: Tuomas Tynkkynen Date: Sun, 21 Jun 2020 13:43:26 +0300 Subject: usbnet: smsc95xx: Fix use-after-free after removal Syzbot reports an use-after-free in workqueue context: BUG: KASAN: use-after-free in mutex_unlock+0x19/0x40 kernel/locking/mutex.c:737 mutex_unlock+0x19/0x40 kernel/locking/mutex.c:737 __smsc95xx_mdio_read drivers/net/usb/smsc95xx.c:217 [inline] smsc95xx_mdio_read+0x583/0x870 drivers/net/usb/smsc95xx.c:278 check_carrier+0xd1/0x2e0 drivers/net/usb/smsc95xx.c:644 process_one_work+0x777/0xf90 kernel/workqueue.c:2274 worker_thread+0xa8f/0x1430 kernel/workqueue.c:2420 kthread+0x2df/0x300 kernel/kthread.c:255 It looks like that smsc95xx_unbind() is freeing the structures that are still in use by the concurrently running workqueue callback. Thus switch to using cancel_delayed_work_sync() to ensure the work callback really is no longer active. Reported-by: syzbot+29dc7d4ae19b703ff947@syzkaller.appspotmail.com Signed-off-by: Tuomas Tynkkynen Signed-off-by: David S. Miller --- drivers/net/usb/smsc95xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c index 355be77f4241..3cf4dc3433f9 100644 --- a/drivers/net/usb/smsc95xx.c +++ b/drivers/net/usb/smsc95xx.c @@ -1324,7 +1324,7 @@ static void smsc95xx_unbind(struct usbnet *dev, struct usb_interface *intf) struct smsc95xx_priv *pdata = (struct smsc95xx_priv *)(dev->data[0]); if (pdata) { - cancel_delayed_work(&pdata->carrier_check); + cancel_delayed_work_sync(&pdata->carrier_check); netif_dbg(dev, ifdown, dev->net, "free pdata\n"); kfree(pdata); pdata = NULL; -- cgit v1.2.3 From de0083c7ed7dba036d1ed6e012157649d45313c8 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 21 Jun 2020 13:46:25 +0000 Subject: hsr: avoid to create proc file after unregister When an interface is being deleted, "/proc/net/dev_snmp6/" is deleted. The function for this is addrconf_ifdown() in the addrconf_notify() and it is called by notification, which is NETDEV_UNREGISTER. But, if NETDEV_CHANGEMTU is triggered after NETDEV_UNREGISTER, this proc file will be created again. This recreated proc file will be deleted by netdev_wati_allrefs(). Before netdev_wait_allrefs() is called, creating a new HSR interface routine can be executed and It tries to create a proc file but it will find an un-deleted proc file. At this point, it warns about it. To avoid this situation, it can use ->dellink() instead of ->ndo_uninit() to release resources because ->dellink() is called before NETDEV_UNREGISTER. So, a proc file will not be recreated. Test commands ip link add dummy0 type dummy ip link add dummy1 type dummy ip link set dummy0 mtu 1300 #SHELL1 while : do ip link add hsr0 type hsr slave1 dummy0 slave2 dummy1 done #SHELL2 while : do ip link del hsr0 done Splat looks like: [ 9888.980852][ T2752] proc_dir_entry 'dev_snmp6/hsr0' already registered [ 9888.981797][ C2] WARNING: CPU: 2 PID: 2752 at fs/proc/generic.c:372 proc_register+0x2d5/0x430 [ 9888.981798][ C2] Modules linked in: hsr dummy veth openvswitch nsh nf_conncount nf_nat nf_conntrack nf_defrag_ipv6x [ 9888.981814][ C2] CPU: 2 PID: 2752 Comm: ip Tainted: G W 5.8.0-rc1+ #616 [ 9888.981815][ C2] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 9888.981816][ C2] RIP: 0010:proc_register+0x2d5/0x430 [ 9888.981818][ C2] Code: fc ff df 48 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 65 01 00 00 49 8b b5 e0 00 00 00 48 89 ea 40 [ 9888.981819][ C2] RSP: 0018:ffff8880628dedf0 EFLAGS: 00010286 [ 9888.981821][ C2] RAX: dffffc0000000008 RBX: ffff888028c69170 RCX: ffffffffaae09a62 [ 9888.981822][ C2] RDX: 0000000000000001 RSI: 0000000000000008 RDI: ffff88806c9f75ac [ 9888.981823][ C2] RBP: ffff888028c693f4 R08: ffffed100d9401bd R09: ffffed100d9401bd [ 9888.981824][ C2] R10: ffffffffaddf406f R11: 0000000000000001 R12: ffff888028c69308 [ 9888.981825][ C2] R13: ffff8880663584c8 R14: dffffc0000000000 R15: ffffed100518d27e [ 9888.981827][ C2] FS: 00007f3876b3b0c0(0000) GS:ffff88806c800000(0000) knlGS:0000000000000000 [ 9888.981828][ C2] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 9888.981829][ C2] CR2: 00007f387601a8c0 CR3: 000000004101a002 CR4: 00000000000606e0 [ 9888.981830][ C2] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 9888.981831][ C2] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 9888.981832][ C2] Call Trace: [ 9888.981833][ C2] ? snmp6_seq_show+0x180/0x180 [ 9888.981834][ C2] proc_create_single_data+0x7c/0xa0 [ 9888.981835][ C2] snmp6_register_dev+0xb0/0x130 [ 9888.981836][ C2] ipv6_add_dev+0x4b7/0xf60 [ 9888.981837][ C2] addrconf_notify+0x684/0x1ca0 [ 9888.981838][ C2] ? __mutex_unlock_slowpath+0xd0/0x670 [ 9888.981839][ C2] ? kasan_unpoison_shadow+0x30/0x40 [ 9888.981840][ C2] ? wait_for_completion+0x250/0x250 [ 9888.981841][ C2] ? inet6_ifinfo_notify+0x100/0x100 [ 9888.981842][ C2] ? dropmon_net_event+0x227/0x410 [ 9888.981843][ C2] ? notifier_call_chain+0x90/0x160 [ 9888.981844][ C2] ? inet6_ifinfo_notify+0x100/0x100 [ 9888.981845][ C2] notifier_call_chain+0x90/0x160 [ 9888.981846][ C2] register_netdevice+0xbe5/0x1070 [ ... ] Reported-by: syzbot+1d51c8b74efa4c44adeb@syzkaller.appspotmail.com Fixes: e0a4b99773d3 ("hsr: use upper/lower device infrastructure") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_device.c | 21 +-------------------- net/hsr/hsr_device.h | 2 +- net/hsr/hsr_main.c | 9 ++++++--- net/hsr/hsr_netlink.c | 17 +++++++++++++++++ 4 files changed, 25 insertions(+), 24 deletions(-) diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index cd99f548e440..478852ef98ef 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -339,7 +339,7 @@ static void hsr_announce(struct timer_list *t) rcu_read_unlock(); } -static void hsr_del_ports(struct hsr_priv *hsr) +void hsr_del_ports(struct hsr_priv *hsr) { struct hsr_port *port; @@ -356,31 +356,12 @@ static void hsr_del_ports(struct hsr_priv *hsr) hsr_del_port(port); } -/* This has to be called after all the readers are gone. - * Otherwise we would have to check the return value of - * hsr_port_get_hsr(). - */ -static void hsr_dev_destroy(struct net_device *hsr_dev) -{ - struct hsr_priv *hsr = netdev_priv(hsr_dev); - - hsr_debugfs_term(hsr); - hsr_del_ports(hsr); - - del_timer_sync(&hsr->prune_timer); - del_timer_sync(&hsr->announce_timer); - - hsr_del_self_node(hsr); - hsr_del_nodes(&hsr->node_db); -} - static const struct net_device_ops hsr_device_ops = { .ndo_change_mtu = hsr_dev_change_mtu, .ndo_open = hsr_dev_open, .ndo_stop = hsr_dev_close, .ndo_start_xmit = hsr_dev_xmit, .ndo_fix_features = hsr_fix_features, - .ndo_uninit = hsr_dev_destroy, }; static struct device_type hsr_type = { diff --git a/net/hsr/hsr_device.h b/net/hsr/hsr_device.h index a099d7de7e79..b8f9262ed101 100644 --- a/net/hsr/hsr_device.h +++ b/net/hsr/hsr_device.h @@ -11,6 +11,7 @@ #include #include "hsr_main.h" +void hsr_del_ports(struct hsr_priv *hsr); void hsr_dev_setup(struct net_device *dev); int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], unsigned char multicast_spec, u8 protocol_version, @@ -18,5 +19,4 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], void hsr_check_carrier_and_operstate(struct hsr_priv *hsr); bool is_hsr_master(struct net_device *dev); int hsr_get_max_mtu(struct hsr_priv *hsr); - #endif /* __HSR_DEVICE_H */ diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index e2564de67603..144da15f0a81 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -6,6 +6,7 @@ */ #include +#include #include #include #include @@ -100,8 +101,10 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, master = hsr_port_get_hsr(port->hsr, HSR_PT_MASTER); hsr_del_port(port); if (hsr_slave_empty(master->hsr)) { - unregister_netdevice_queue(master->dev, - &list_kill); + const struct rtnl_link_ops *ops; + + ops = master->dev->rtnl_link_ops; + ops->dellink(master->dev, &list_kill); unregister_netdevice_many(&list_kill); } } @@ -144,9 +147,9 @@ static int __init hsr_init(void) static void __exit hsr_exit(void) { - unregister_netdevice_notifier(&hsr_nb); hsr_netlink_exit(); hsr_debugfs_remove_root(); + unregister_netdevice_notifier(&hsr_nb); } module_init(hsr_init); diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 1decb25f6764..6e14b7d22639 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -83,6 +83,22 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, return hsr_dev_finalize(dev, link, multicast_spec, hsr_version, extack); } +static void hsr_dellink(struct net_device *dev, struct list_head *head) +{ + struct hsr_priv *hsr = netdev_priv(dev); + + del_timer_sync(&hsr->prune_timer); + del_timer_sync(&hsr->announce_timer); + + hsr_debugfs_term(hsr); + hsr_del_ports(hsr); + + hsr_del_self_node(hsr); + hsr_del_nodes(&hsr->node_db); + + unregister_netdevice_queue(dev, head); +} + static int hsr_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct hsr_priv *hsr = netdev_priv(dev); @@ -118,6 +134,7 @@ static struct rtnl_link_ops hsr_link_ops __read_mostly = { .priv_size = sizeof(struct hsr_priv), .setup = hsr_dev_setup, .newlink = hsr_newlink, + .dellink = hsr_dellink, .fill_info = hsr_fill_info, }; -- cgit v1.2.3 From 21a739c64d3e9871186483a0cc3e7b52638c3d59 Mon Sep 17 00:00:00 2001 From: Gaurav Singh Date: Sun, 21 Jun 2020 11:30:17 -0400 Subject: ethtool: Fix check in ethtool_rx_flow_rule_create Fix check in ethtool_rx_flow_rule_create Fixes: eca4205f9ec3 ("ethtool: add ethtool_rx_flow_spec to flow_rule structure translator") Signed-off-by: Gaurav Singh Signed-off-by: David S. Miller --- net/ethtool/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index b5df90c981c2..21d5fc0f6bb3 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -2978,7 +2978,7 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) sizeof(match->mask.ipv6.dst)); } if (memcmp(v6_m_spec->ip6src, &zero_addr, sizeof(zero_addr)) || - memcmp(v6_m_spec->ip6src, &zero_addr, sizeof(zero_addr))) { + memcmp(v6_m_spec->ip6dst, &zero_addr, sizeof(zero_addr))) { match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS); match->dissector.offset[FLOW_DISSECTOR_KEY_IPV6_ADDRS] = -- cgit v1.2.3 From b562f58bbc12444219b74a5d6524977a3d87a022 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 22 Jun 2020 19:45:58 +0800 Subject: mptcp: drop sndr_key in mptcp_syn_options In RFC 8684, we don't need to send sndr_key in SYN package anymore, so drop it. Fixes: cc7972ea1932 ("mptcp: parse and emit MP_CAPABLE option according to v1 spec") Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/options.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 490b92534afc..df9a51425c6f 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -336,9 +336,7 @@ bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, */ subflow->snd_isn = TCP_SKB_CB(skb)->end_seq; if (subflow->request_mptcp) { - pr_debug("local_key=%llu", subflow->local_key); opts->suboptions = OPTION_MPTCP_MPC_SYN; - opts->sndr_key = subflow->local_key; *size = TCPOLEN_MPTCP_MPC_SYN; return true; } else if (subflow->request_join) { -- cgit v1.2.3 From 26ac10be3c80a26d03c950b7387c4b9566c260b6 Mon Sep 17 00:00:00 2001 From: Aiden Leong Date: Mon, 22 Jun 2020 20:04:58 -0700 Subject: GUE: Fix a typo Fix a typo in gue.h Signed-off-by: Aiden Leong Signed-off-by: David S. Miller --- include/net/gue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/gue.h b/include/net/gue.h index 3a6595bfa641..e42402f180b7 100644 --- a/include/net/gue.h +++ b/include/net/gue.h @@ -21,7 +21,7 @@ * | | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * - * C bit indicates contol message when set, data message when unset. + * C bit indicates control message when set, data message when unset. * For a control message, proto/ctype is interpreted as a type of * control message. For data messages, proto/ctype is the IP protocol * of the next header. -- cgit v1.2.3 From 2464bc7c2897b7549146a743045089d3aea7b85b Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 23 Jun 2020 11:05:40 +0200 Subject: bridge: uapi: mrp: Fix MRP_PORT_ROLE Currently the MRP_PORT_ROLE_NONE has the value 0x2 but this is in conflict with the IEC 62439-2 standard. The standard defines the following port roles: primary (0x0), secondary(0x1), interconnect(0x2). Therefore remove the port role none. Fixes: 4714d13791f831 ("bridge: uapi: mrp: Add mrp attributes.") Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/uapi/linux/mrp_bridge.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/uapi/linux/mrp_bridge.h b/include/uapi/linux/mrp_bridge.h index 84f15f48a7cb..bee366540212 100644 --- a/include/uapi/linux/mrp_bridge.h +++ b/include/uapi/linux/mrp_bridge.h @@ -36,7 +36,6 @@ enum br_mrp_port_state_type { enum br_mrp_port_role_type { BR_MRP_PORT_ROLE_PRIMARY, BR_MRP_PORT_ROLE_SECONDARY, - BR_MRP_PORT_ROLE_NONE, }; enum br_mrp_tlv_header_type { -- cgit v1.2.3 From 7882c895b71b8cd6c81946fcc8e13d15a2841de7 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 23 Jun 2020 11:05:41 +0200 Subject: bridge: mrp: Validate when setting the port role This patch adds specific checks for primary(0x0) and secondary(0x1) when setting the port role. For any other value the function 'br_mrp_set_port_role' will return -EINVAL. Fixes: 20f6a05ef63594 ("bridge: mrp: Rework the MRP netlink interface") Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/br_mrp.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index 24986ec7d38c..779e1eb75443 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -411,10 +411,16 @@ int br_mrp_set_port_role(struct net_bridge_port *p, if (!mrp) return -EINVAL; - if (role == BR_MRP_PORT_ROLE_PRIMARY) + switch (role) { + case BR_MRP_PORT_ROLE_PRIMARY: rcu_assign_pointer(mrp->p_port, p); - else + break; + case BR_MRP_PORT_ROLE_SECONDARY: rcu_assign_pointer(mrp->s_port, p); + break; + default: + return -EINVAL; + } br_mrp_port_switchdev_set_role(p, role); -- cgit v1.2.3 From 558b353c9c2a717509f291c066c6bd8f5f5e21be Mon Sep 17 00:00:00 2001 From: Frank Werner-Krippendorf Date: Tue, 23 Jun 2020 03:59:44 -0600 Subject: wireguard: noise: do not assign initiation time in if condition Fixes an error condition reported by checkpatch.pl which caused by assigning a variable in an if condition in wg_noise_handshake_consume_ initiation(). Signed-off-by: Frank Werner-Krippendorf Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- drivers/net/wireguard/noise.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireguard/noise.c b/drivers/net/wireguard/noise.c index 626433690abb..201a22681945 100644 --- a/drivers/net/wireguard/noise.c +++ b/drivers/net/wireguard/noise.c @@ -617,8 +617,8 @@ wg_noise_handshake_consume_initiation(struct message_handshake_initiation *src, memcpy(handshake->hash, hash, NOISE_HASH_LEN); memcpy(handshake->chaining_key, chaining_key, NOISE_HASH_LEN); handshake->remote_index = src->sender_index; - if ((s64)(handshake->last_initiation_consumption - - (initiation_consumption = ktime_get_coarse_boottime_ns())) < 0) + initiation_consumption = ktime_get_coarse_boottime_ns(); + if ((s64)(handshake->last_initiation_consumption - initiation_consumption) < 0) handshake->last_initiation_consumption = initiation_consumption; handshake->state = HANDSHAKE_CONSUMED_INITIATION; up_write(&handshake->lock); -- cgit v1.2.3 From 900575aa33a3eaaef802b31de187a85c4a4b4bd0 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 23 Jun 2020 03:59:45 -0600 Subject: wireguard: device: avoid circular netns references Before, we took a reference to the creating netns if the new netns was different. This caused issues with circular references, with two wireguard interfaces swapping namespaces. The solution is to rather not take any extra references at all, but instead simply invalidate the creating netns pointer when that netns is deleted. In order to prevent this from happening again, this commit improves the rough object leak tracking by allowing it to account for created and destroyed interfaces, aside from just peers and keys. That then makes it possible to check for the object leak when having two interfaces take a reference to each others' namespaces. Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- drivers/net/wireguard/device.c | 58 ++++++++++++++---------------- drivers/net/wireguard/device.h | 3 +- drivers/net/wireguard/netlink.c | 14 +++++--- drivers/net/wireguard/socket.c | 25 +++++++++---- tools/testing/selftests/wireguard/netns.sh | 13 ++++++- 5 files changed, 67 insertions(+), 46 deletions(-) diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c index 3ac3f8570ca1..a8f151b1b5fa 100644 --- a/drivers/net/wireguard/device.c +++ b/drivers/net/wireguard/device.c @@ -45,17 +45,18 @@ static int wg_open(struct net_device *dev) if (dev_v6) dev_v6->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_NONE; + mutex_lock(&wg->device_update_lock); ret = wg_socket_init(wg, wg->incoming_port); if (ret < 0) - return ret; - mutex_lock(&wg->device_update_lock); + goto out; list_for_each_entry(peer, &wg->peer_list, peer_list) { wg_packet_send_staged_packets(peer); if (peer->persistent_keepalive_interval) wg_packet_send_keepalive(peer); } +out: mutex_unlock(&wg->device_update_lock); - return 0; + return ret; } #ifdef CONFIG_PM_SLEEP @@ -225,6 +226,7 @@ static void wg_destruct(struct net_device *dev) list_del(&wg->device_list); rtnl_unlock(); mutex_lock(&wg->device_update_lock); + rcu_assign_pointer(wg->creating_net, NULL); wg->incoming_port = 0; wg_socket_reinit(wg, NULL, NULL); /* The final references are cleared in the below calls to destroy_workqueue. */ @@ -240,13 +242,11 @@ static void wg_destruct(struct net_device *dev) skb_queue_purge(&wg->incoming_handshakes); free_percpu(dev->tstats); free_percpu(wg->incoming_handshakes_worker); - if (wg->have_creating_net_ref) - put_net(wg->creating_net); kvfree(wg->index_hashtable); kvfree(wg->peer_hashtable); mutex_unlock(&wg->device_update_lock); - pr_debug("%s: Interface deleted\n", dev->name); + pr_debug("%s: Interface destroyed\n", dev->name); free_netdev(dev); } @@ -292,7 +292,7 @@ static int wg_newlink(struct net *src_net, struct net_device *dev, struct wg_device *wg = netdev_priv(dev); int ret = -ENOMEM; - wg->creating_net = src_net; + rcu_assign_pointer(wg->creating_net, src_net); init_rwsem(&wg->static_identity.lock); mutex_init(&wg->socket_update_lock); mutex_init(&wg->device_update_lock); @@ -393,30 +393,26 @@ static struct rtnl_link_ops link_ops __read_mostly = { .newlink = wg_newlink, }; -static int wg_netdevice_notification(struct notifier_block *nb, - unsigned long action, void *data) +static void wg_netns_pre_exit(struct net *net) { - struct net_device *dev = ((struct netdev_notifier_info *)data)->dev; - struct wg_device *wg = netdev_priv(dev); - - ASSERT_RTNL(); - - if (action != NETDEV_REGISTER || dev->netdev_ops != &netdev_ops) - return 0; + struct wg_device *wg; - if (dev_net(dev) == wg->creating_net && wg->have_creating_net_ref) { - put_net(wg->creating_net); - wg->have_creating_net_ref = false; - } else if (dev_net(dev) != wg->creating_net && - !wg->have_creating_net_ref) { - wg->have_creating_net_ref = true; - get_net(wg->creating_net); + rtnl_lock(); + list_for_each_entry(wg, &device_list, device_list) { + if (rcu_access_pointer(wg->creating_net) == net) { + pr_debug("%s: Creating namespace exiting\n", wg->dev->name); + netif_carrier_off(wg->dev); + mutex_lock(&wg->device_update_lock); + rcu_assign_pointer(wg->creating_net, NULL); + wg_socket_reinit(wg, NULL, NULL); + mutex_unlock(&wg->device_update_lock); + } } - return 0; + rtnl_unlock(); } -static struct notifier_block netdevice_notifier = { - .notifier_call = wg_netdevice_notification +static struct pernet_operations pernet_ops = { + .pre_exit = wg_netns_pre_exit }; int __init wg_device_init(void) @@ -429,18 +425,18 @@ int __init wg_device_init(void) return ret; #endif - ret = register_netdevice_notifier(&netdevice_notifier); + ret = register_pernet_device(&pernet_ops); if (ret) goto error_pm; ret = rtnl_link_register(&link_ops); if (ret) - goto error_netdevice; + goto error_pernet; return 0; -error_netdevice: - unregister_netdevice_notifier(&netdevice_notifier); +error_pernet: + unregister_pernet_device(&pernet_ops); error_pm: #ifdef CONFIG_PM_SLEEP unregister_pm_notifier(&pm_notifier); @@ -451,7 +447,7 @@ error_pm: void wg_device_uninit(void) { rtnl_link_unregister(&link_ops); - unregister_netdevice_notifier(&netdevice_notifier); + unregister_pernet_device(&pernet_ops); #ifdef CONFIG_PM_SLEEP unregister_pm_notifier(&pm_notifier); #endif diff --git a/drivers/net/wireguard/device.h b/drivers/net/wireguard/device.h index b15a8be9d816..4d0144e16947 100644 --- a/drivers/net/wireguard/device.h +++ b/drivers/net/wireguard/device.h @@ -40,7 +40,7 @@ struct wg_device { struct net_device *dev; struct crypt_queue encrypt_queue, decrypt_queue; struct sock __rcu *sock4, *sock6; - struct net *creating_net; + struct net __rcu *creating_net; struct noise_static_identity static_identity; struct workqueue_struct *handshake_receive_wq, *handshake_send_wq; struct workqueue_struct *packet_crypt_wq; @@ -56,7 +56,6 @@ struct wg_device { unsigned int num_peers, device_update_gen; u32 fwmark; u16 incoming_port; - bool have_creating_net_ref; }; int wg_device_init(void); diff --git a/drivers/net/wireguard/netlink.c b/drivers/net/wireguard/netlink.c index 802099c8828a..20a4f3c0a0a1 100644 --- a/drivers/net/wireguard/netlink.c +++ b/drivers/net/wireguard/netlink.c @@ -511,11 +511,15 @@ static int wg_set_device(struct sk_buff *skb, struct genl_info *info) if (flags & ~__WGDEVICE_F_ALL) goto out; - ret = -EPERM; - if ((info->attrs[WGDEVICE_A_LISTEN_PORT] || - info->attrs[WGDEVICE_A_FWMARK]) && - !ns_capable(wg->creating_net->user_ns, CAP_NET_ADMIN)) - goto out; + if (info->attrs[WGDEVICE_A_LISTEN_PORT] || info->attrs[WGDEVICE_A_FWMARK]) { + struct net *net; + rcu_read_lock(); + net = rcu_dereference(wg->creating_net); + ret = !net || !ns_capable(net->user_ns, CAP_NET_ADMIN) ? -EPERM : 0; + rcu_read_unlock(); + if (ret) + goto out; + } ++wg->device_update_gen; diff --git a/drivers/net/wireguard/socket.c b/drivers/net/wireguard/socket.c index f9018027fc13..c33e2c81635f 100644 --- a/drivers/net/wireguard/socket.c +++ b/drivers/net/wireguard/socket.c @@ -347,6 +347,7 @@ static void set_sock_opts(struct socket *sock) int wg_socket_init(struct wg_device *wg, u16 port) { + struct net *net; int ret; struct udp_tunnel_sock_cfg cfg = { .sk_user_data = wg, @@ -371,37 +372,47 @@ int wg_socket_init(struct wg_device *wg, u16 port) }; #endif + rcu_read_lock(); + net = rcu_dereference(wg->creating_net); + net = net ? maybe_get_net(net) : NULL; + rcu_read_unlock(); + if (unlikely(!net)) + return -ENONET; + #if IS_ENABLED(CONFIG_IPV6) retry: #endif - ret = udp_sock_create(wg->creating_net, &port4, &new4); + ret = udp_sock_create(net, &port4, &new4); if (ret < 0) { pr_err("%s: Could not create IPv4 socket\n", wg->dev->name); - return ret; + goto out; } set_sock_opts(new4); - setup_udp_tunnel_sock(wg->creating_net, new4, &cfg); + setup_udp_tunnel_sock(net, new4, &cfg); #if IS_ENABLED(CONFIG_IPV6) if (ipv6_mod_enabled()) { port6.local_udp_port = inet_sk(new4->sk)->inet_sport; - ret = udp_sock_create(wg->creating_net, &port6, &new6); + ret = udp_sock_create(net, &port6, &new6); if (ret < 0) { udp_tunnel_sock_release(new4); if (ret == -EADDRINUSE && !port && retries++ < 100) goto retry; pr_err("%s: Could not create IPv6 socket\n", wg->dev->name); - return ret; + goto out; } set_sock_opts(new6); - setup_udp_tunnel_sock(wg->creating_net, new6, &cfg); + setup_udp_tunnel_sock(net, new6, &cfg); } #endif wg_socket_reinit(wg, new4->sk, new6 ? new6->sk : NULL); - return 0; + ret = 0; +out: + put_net(net); + return ret; } void wg_socket_reinit(struct wg_device *wg, struct sock *new4, diff --git a/tools/testing/selftests/wireguard/netns.sh b/tools/testing/selftests/wireguard/netns.sh index 17a1f53ceba0..d77f4829f1e0 100755 --- a/tools/testing/selftests/wireguard/netns.sh +++ b/tools/testing/selftests/wireguard/netns.sh @@ -587,9 +587,20 @@ ip0 link set wg0 up kill $ncat_pid ip0 link del wg0 +# Ensure there aren't circular reference loops +ip1 link add wg1 type wireguard +ip2 link add wg2 type wireguard +ip1 link set wg1 netns $netns2 +ip2 link set wg2 netns $netns1 +pp ip netns delete $netns1 +pp ip netns delete $netns2 +pp ip netns add $netns1 +pp ip netns add $netns2 + +sleep 2 # Wait for cleanup and grace periods declare -A objects while read -t 0.1 -r line 2>/dev/null || [[ $? -ne 142 ]]; do - [[ $line =~ .*(wg[0-9]+:\ [A-Z][a-z]+\ [0-9]+)\ .*(created|destroyed).* ]] || continue + [[ $line =~ .*(wg[0-9]+:\ [A-Z][a-z]+\ ?[0-9]*)\ .*(created|destroyed).* ]] || continue objects["${BASH_REMATCH[1]}"]+="${BASH_REMATCH[2]}" done < /dev/kmsg alldeleted=1 -- cgit v1.2.3 From b4730ae6a443afe611afb4fb651c885c51003c15 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 23 Jun 2020 10:43:48 +0000 Subject: net: ethtool: add missing string for NETIF_F_GSO_TUNNEL_REMCSUM Commit e585f2363637 ("udp: Changes to udp_offload to support remote checksum offload") added new GSO type and a corresponding netdev feature, but missed Ethtool's 'netdev_features_strings' table. Give it a name so it will be exposed to userspace and become available for manual configuration. v3: - decouple from "netdev_features_strings[] cleanup" series; - no functional changes. v2: - don't split the "Fixes:" tag across lines; - no functional changes. Fixes: e585f2363637 ("udp: Changes to udp_offload to support remote checksum offload") Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- net/ethtool/common.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 47f63526818e..aaecfc916a4d 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -40,6 +40,7 @@ const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation", [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial", + [NETIF_F_GSO_TUNNEL_REMCSUM_BIT] = "tx-tunnel-remcsum-segmentation", [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation", [NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation", [NETIF_F_GSO_UDP_L4_BIT] = "tx-udp-segmentation", -- cgit v1.2.3 From 97dd1abd026ae4e6a82fa68645928404ad483409 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 23 Jun 2020 16:51:29 +0300 Subject: net: qed: fix left elements count calculation qed_chain_get_element_left{,_u32} returned 0 when the difference between producer and consumer page count was equal to the total page count. Fix this by conditional expanding of producer value (vs unconditional). This allowed to eliminate normalizaton against total page count, which was the cause of this bug. Misc: replace open-coded constants with common defines. Fixes: a91eb52abb50 ("qed: Revisit chain implementation") Signed-off-by: Alexander Lobakin Signed-off-by: Igor Russkikh Signed-off-by: Michal Kalderon Signed-off-by: David S. Miller --- include/linux/qed/qed_chain.h | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/include/linux/qed/qed_chain.h b/include/linux/qed/qed_chain.h index 733fad7dfbed..6d15040c642c 100644 --- a/include/linux/qed/qed_chain.h +++ b/include/linux/qed/qed_chain.h @@ -207,28 +207,34 @@ static inline u32 qed_chain_get_cons_idx_u32(struct qed_chain *p_chain) static inline u16 qed_chain_get_elem_left(struct qed_chain *p_chain) { + u16 elem_per_page = p_chain->elem_per_page; + u32 prod = p_chain->u.chain16.prod_idx; + u32 cons = p_chain->u.chain16.cons_idx; u16 used; - used = (u16) (((u32)0x10000 + - (u32)p_chain->u.chain16.prod_idx) - - (u32)p_chain->u.chain16.cons_idx); + if (prod < cons) + prod += (u32)U16_MAX + 1; + + used = (u16)(prod - cons); if (p_chain->mode == QED_CHAIN_MODE_NEXT_PTR) - used -= p_chain->u.chain16.prod_idx / p_chain->elem_per_page - - p_chain->u.chain16.cons_idx / p_chain->elem_per_page; + used -= prod / elem_per_page - cons / elem_per_page; return (u16)(p_chain->capacity - used); } static inline u32 qed_chain_get_elem_left_u32(struct qed_chain *p_chain) { + u16 elem_per_page = p_chain->elem_per_page; + u64 prod = p_chain->u.chain32.prod_idx; + u64 cons = p_chain->u.chain32.cons_idx; u32 used; - used = (u32) (((u64)0x100000000ULL + - (u64)p_chain->u.chain32.prod_idx) - - (u64)p_chain->u.chain32.cons_idx); + if (prod < cons) + prod += (u64)U32_MAX + 1; + + used = (u32)(prod - cons); if (p_chain->mode == QED_CHAIN_MODE_NEXT_PTR) - used -= p_chain->u.chain32.prod_idx / p_chain->elem_per_page - - p_chain->u.chain32.cons_idx / p_chain->elem_per_page; + used -= (u32)(prod / elem_per_page - cons / elem_per_page); return p_chain->capacity - used; } -- cgit v1.2.3 From 31333c1a2521ff4b4ceb0c29de492549cd4a8de3 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 23 Jun 2020 16:51:30 +0300 Subject: net: qed: fix async event callbacks unregistering qed_spq_unregister_async_cb() should be called before qed_rdma_info_free() to avoid crash-spawning uses-after-free. Instead of calling it from each subsystem exit code, do it in one place on PF down. Fixes: 291d57f67d24 ("qed: Fix rdma_info structure allocation") Signed-off-by: Alexander Lobakin Signed-off-by: Igor Russkikh Signed-off-by: Michal Kalderon Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_dev.c | 9 +++++++-- drivers/net/ethernet/qlogic/qed/qed_iwarp.c | 2 -- drivers/net/ethernet/qlogic/qed/qed_roce.c | 1 - 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index 1eebf30fa798..b41ada668948 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -1368,6 +1368,8 @@ static void qed_dbg_user_data_free(struct qed_hwfn *p_hwfn) void qed_resc_free(struct qed_dev *cdev) { + struct qed_rdma_info *rdma_info; + struct qed_hwfn *p_hwfn; int i; if (IS_VF(cdev)) { @@ -1385,7 +1387,8 @@ void qed_resc_free(struct qed_dev *cdev) qed_llh_free(cdev); for_each_hwfn(cdev, i) { - struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + p_hwfn = cdev->hwfns + i; + rdma_info = p_hwfn->p_rdma_info; qed_cxt_mngr_free(p_hwfn); qed_qm_info_free(p_hwfn); @@ -1404,8 +1407,10 @@ void qed_resc_free(struct qed_dev *cdev) qed_ooo_free(p_hwfn); } - if (QED_IS_RDMA_PERSONALITY(p_hwfn)) + if (QED_IS_RDMA_PERSONALITY(p_hwfn) && rdma_info) { + qed_spq_unregister_async_cb(p_hwfn, rdma_info->proto); qed_rdma_info_free(p_hwfn); + } qed_iov_free(p_hwfn); qed_l2_free(p_hwfn); diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c index d2fe61a5cf56..5409a2da6106 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c @@ -2836,8 +2836,6 @@ int qed_iwarp_stop(struct qed_hwfn *p_hwfn) if (rc) return rc; - qed_spq_unregister_async_cb(p_hwfn, PROTOCOLID_IWARP); - return qed_iwarp_ll2_stop(p_hwfn); } diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.c b/drivers/net/ethernet/qlogic/qed/qed_roce.c index 4566815f7b87..7271dd7166e5 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_roce.c +++ b/drivers/net/ethernet/qlogic/qed/qed_roce.c @@ -113,7 +113,6 @@ void qed_roce_stop(struct qed_hwfn *p_hwfn) break; } } - qed_spq_unregister_async_cb(p_hwfn, PROTOCOLID_ROCE); } static void qed_rdma_copy_gids(struct qed_rdma_qp *qp, __le32 *src_gid, -- cgit v1.2.3 From 4079c7f7a2a00ab403c177ce723b560de59139c3 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 23 Jun 2020 16:51:31 +0300 Subject: net: qede: stop adding events on an already destroyed workqueue Set rdma_wq pointer to NULL after destroying the workqueue and check for it when adding new events to fix crashes on driver unload. Fixes: cee9fbd8e2e9 ("qede: Add qedr framework") Signed-off-by: Alexander Lobakin Signed-off-by: Igor Russkikh Signed-off-by: Michal Kalderon Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qede/qede_rdma.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qede/qede_rdma.c b/drivers/net/ethernet/qlogic/qede/qede_rdma.c index 2d873ae8a234..668ccc9d49f8 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_rdma.c +++ b/drivers/net/ethernet/qlogic/qede/qede_rdma.c @@ -105,6 +105,7 @@ static void qede_rdma_destroy_wq(struct qede_dev *edev) qede_rdma_cleanup_event(edev); destroy_workqueue(edev->rdma_info.rdma_wq); + edev->rdma_info.rdma_wq = NULL; } int qede_rdma_dev_add(struct qede_dev *edev, bool recovery) @@ -325,7 +326,7 @@ static void qede_rdma_add_event(struct qede_dev *edev, if (edev->rdma_info.exp_recovery) return; - if (!edev->rdma_info.qedr_dev) + if (!edev->rdma_info.qedr_dev || !edev->rdma_info.rdma_wq) return; /* We don't want the cleanup flow to start while we're allocating and -- cgit v1.2.3 From ccd7c7ce167a21dbf2b698ffcf00f11d96d44f9b Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 23 Jun 2020 16:51:32 +0300 Subject: net: qed: fix NVMe login fails over VFs 25ms sleep cycles in waiting for PF response are excessive and may lead to different timeout failures. Start to wait with short udelays, and in most cases polling will end here. If the time was not sufficient, switch to msleeps. usleep_range() may go far beyond 100us depending on platform and tick configuration, hence atomic udelays for consistency. Also add explicit DMA barriers since 'done' always comes from a shared request-response DMA pool, and note that in the comment nearby. Fixes: 1408cc1fa48c ("qed: Introduce VFs") Signed-off-by: Alexander Lobakin Signed-off-by: Igor Russkikh Signed-off-by: Michal Kalderon Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_vf.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c index 856051f50eb7..adc2c8f3d48e 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_vf.c +++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c @@ -81,12 +81,17 @@ static void qed_vf_pf_req_end(struct qed_hwfn *p_hwfn, int req_status) mutex_unlock(&(p_hwfn->vf_iov_info->mutex)); } +#define QED_VF_CHANNEL_USLEEP_ITERATIONS 90 +#define QED_VF_CHANNEL_USLEEP_DELAY 100 +#define QED_VF_CHANNEL_MSLEEP_ITERATIONS 10 +#define QED_VF_CHANNEL_MSLEEP_DELAY 25 + static int qed_send_msg2pf(struct qed_hwfn *p_hwfn, u8 *done, u32 resp_size) { union vfpf_tlvs *p_req = p_hwfn->vf_iov_info->vf2pf_request; struct ustorm_trigger_vf_zone trigger; struct ustorm_vf_zone *zone_data; - int rc = 0, time = 100; + int iter, rc = 0; zone_data = (struct ustorm_vf_zone *)PXP_VF_BAR0_START_USDM_ZONE_B; @@ -126,11 +131,19 @@ static int qed_send_msg2pf(struct qed_hwfn *p_hwfn, u8 *done, u32 resp_size) REG_WR(p_hwfn, (uintptr_t)&zone_data->trigger, *((u32 *)&trigger)); /* When PF would be done with the response, it would write back to the - * `done' address. Poll until then. + * `done' address from a coherent DMA zone. Poll until then. */ - while ((!*done) && time) { - msleep(25); - time--; + + iter = QED_VF_CHANNEL_USLEEP_ITERATIONS; + while (!*done && iter--) { + udelay(QED_VF_CHANNEL_USLEEP_DELAY); + dma_rmb(); + } + + iter = QED_VF_CHANNEL_MSLEEP_ITERATIONS; + while (!*done && iter--) { + msleep(QED_VF_CHANNEL_MSLEEP_DELAY); + dma_rmb(); } if (!*done) { -- cgit v1.2.3 From d434d02f7e7c24c721365fd594ed781acb18e0da Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 23 Jun 2020 16:51:33 +0300 Subject: net: qed: fix excessive QM ILT lines consumption This is likely a copy'n'paste mistake. The amount of ILT lines to reserve for a single VF was being multiplied by the total VFs count. This led to a huge redundancy in reservation and potential lines drainouts. Fixes: 1408cc1fa48c ("qed: Introduce VFs") Signed-off-by: Alexander Lobakin Signed-off-by: Igor Russkikh Signed-off-by: Michal Kalderon Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_cxt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.c b/drivers/net/ethernet/qlogic/qed/qed_cxt.c index 7b76667acaba..c0a769b5358c 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_cxt.c +++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.c @@ -271,7 +271,7 @@ static void qed_cxt_qm_iids(struct qed_hwfn *p_hwfn, vf_tids += segs[NUM_TASK_PF_SEGMENTS].count; } - iids->vf_cids += vf_cids * p_mngr->vf_count; + iids->vf_cids = vf_cids; iids->tids += vf_tids * p_mngr->vf_count; DP_VERBOSE(p_hwfn, QED_MSG_ILT, -- cgit v1.2.3 From 1c85f394c2206ea3835f43534d5675f0574e1b70 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 23 Jun 2020 16:51:34 +0300 Subject: net: qede: fix PTP initialization on recovery Currently PTP cyclecounter and timecounter are initialized only on the first probing and are cleaned up during removal. This means that PTP becomes non-functional after device recovery. Fix this by unconditional PTP initialization on probing and clearing Tx pending bit on exiting. Fixes: ccc67ef50b90 ("qede: Error recovery process") Signed-off-by: Alexander Lobakin Signed-off-by: Igor Russkikh Signed-off-by: Michal Kalderon Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qede/qede_main.c | 2 +- drivers/net/ethernet/qlogic/qede/qede_ptp.c | 31 ++++++++++++---------------- drivers/net/ethernet/qlogic/qede/qede_ptp.h | 2 +- 3 files changed, 15 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index 756c05eb96f3..f6ff31e73ebe 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -1229,7 +1229,7 @@ static int __qede_probe(struct pci_dev *pdev, u32 dp_module, u8 dp_level, /* PTP not supported on VFs */ if (!is_vf) - qede_ptp_enable(edev, (mode == QEDE_PROBE_NORMAL)); + qede_ptp_enable(edev); edev->ops->register_ops(cdev, &qede_ll_ops, edev); diff --git a/drivers/net/ethernet/qlogic/qede/qede_ptp.c b/drivers/net/ethernet/qlogic/qede/qede_ptp.c index 4c7f7a7fc151..cd5841a9415e 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_ptp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_ptp.c @@ -412,6 +412,7 @@ void qede_ptp_disable(struct qede_dev *edev) if (ptp->tx_skb) { dev_kfree_skb_any(ptp->tx_skb); ptp->tx_skb = NULL; + clear_bit_unlock(QEDE_FLAGS_PTP_TX_IN_PRORGESS, &edev->flags); } /* Disable PTP in HW */ @@ -423,7 +424,7 @@ void qede_ptp_disable(struct qede_dev *edev) edev->ptp = NULL; } -static int qede_ptp_init(struct qede_dev *edev, bool init_tc) +static int qede_ptp_init(struct qede_dev *edev) { struct qede_ptp *ptp; int rc; @@ -444,25 +445,19 @@ static int qede_ptp_init(struct qede_dev *edev, bool init_tc) /* Init work queue for Tx timestamping */ INIT_WORK(&ptp->work, qede_ptp_task); - /* Init cyclecounter and timecounter. This is done only in the first - * load. If done in every load, PTP application will fail when doing - * unload / load (e.g. MTU change) while it is running. - */ - if (init_tc) { - memset(&ptp->cc, 0, sizeof(ptp->cc)); - ptp->cc.read = qede_ptp_read_cc; - ptp->cc.mask = CYCLECOUNTER_MASK(64); - ptp->cc.shift = 0; - ptp->cc.mult = 1; - - timecounter_init(&ptp->tc, &ptp->cc, - ktime_to_ns(ktime_get_real())); - } + /* Init cyclecounter and timecounter */ + memset(&ptp->cc, 0, sizeof(ptp->cc)); + ptp->cc.read = qede_ptp_read_cc; + ptp->cc.mask = CYCLECOUNTER_MASK(64); + ptp->cc.shift = 0; + ptp->cc.mult = 1; - return rc; + timecounter_init(&ptp->tc, &ptp->cc, ktime_to_ns(ktime_get_real())); + + return 0; } -int qede_ptp_enable(struct qede_dev *edev, bool init_tc) +int qede_ptp_enable(struct qede_dev *edev) { struct qede_ptp *ptp; int rc; @@ -483,7 +478,7 @@ int qede_ptp_enable(struct qede_dev *edev, bool init_tc) edev->ptp = ptp; - rc = qede_ptp_init(edev, init_tc); + rc = qede_ptp_init(edev); if (rc) goto err1; diff --git a/drivers/net/ethernet/qlogic/qede/qede_ptp.h b/drivers/net/ethernet/qlogic/qede/qede_ptp.h index 691a14c4b2c5..89c7f3cf3ee2 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_ptp.h +++ b/drivers/net/ethernet/qlogic/qede/qede_ptp.h @@ -41,7 +41,7 @@ void qede_ptp_rx_ts(struct qede_dev *edev, struct sk_buff *skb); void qede_ptp_tx_ts(struct qede_dev *edev, struct sk_buff *skb); int qede_ptp_hw_ts(struct qede_dev *edev, struct ifreq *req); void qede_ptp_disable(struct qede_dev *edev); -int qede_ptp_enable(struct qede_dev *edev, bool init_tc); +int qede_ptp_enable(struct qede_dev *edev); int qede_ptp_get_ts_info(struct qede_dev *edev, struct ethtool_ts_info *ts); static inline void qede_ptp_record_rx_ts(struct qede_dev *edev, -- cgit v1.2.3 From ec6c80590bde6b5dfa4970fffa3572f1acd313ca Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 23 Jun 2020 16:51:35 +0300 Subject: net: qede: fix use-after-free on recovery and AER handling Set edev->cdev pointer to NULL after calling remove() callback to avoid using of already freed object. Fixes: ccc67ef50b90 ("qede: Error recovery process") Signed-off-by: Alexander Lobakin Signed-off-by: Igor Russkikh Signed-off-by: Michal Kalderon Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qede/qede_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index f6ff31e73ebe..29e285430f99 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -1318,6 +1318,7 @@ static void __qede_remove(struct pci_dev *pdev, enum qede_remove_mode mode) if (system_state == SYSTEM_POWER_OFF) return; qed_ops->common->remove(cdev); + edev->cdev = NULL; /* Since this can happen out-of-sync with other flows, * don't release the netdevice until after slowpath stop -- cgit v1.2.3 From c221dd1831732449f844e03631a34fd21c2b9429 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 23 Jun 2020 16:51:36 +0300 Subject: net: qed: reset ILT block sizes before recomputing to fix crashes Sizes of all ILT blocks must be reset before ILT recomputing when disabling clients, or memory allocation may exceed ILT shadow array and provoke system crashes. Fixes: 1408cc1fa48c ("qed: Introduce VFs") Signed-off-by: Alexander Lobakin Signed-off-by: Igor Russkikh Signed-off-by: Michal Kalderon Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_cxt.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.c b/drivers/net/ethernet/qlogic/qed/qed_cxt.c index c0a769b5358c..08ba9d54ab63 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_cxt.c +++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.c @@ -465,6 +465,20 @@ static struct qed_ilt_cli_blk *qed_cxt_set_blk(struct qed_ilt_cli_blk *p_blk) return p_blk; } +static void qed_cxt_ilt_blk_reset(struct qed_hwfn *p_hwfn) +{ + struct qed_ilt_client_cfg *clients = p_hwfn->p_cxt_mngr->clients; + u32 cli_idx, blk_idx; + + for (cli_idx = 0; cli_idx < MAX_ILT_CLIENTS; cli_idx++) { + for (blk_idx = 0; blk_idx < ILT_CLI_PF_BLOCKS; blk_idx++) + clients[cli_idx].pf_blks[blk_idx].total_size = 0; + + for (blk_idx = 0; blk_idx < ILT_CLI_VF_BLOCKS; blk_idx++) + clients[cli_idx].vf_blks[blk_idx].total_size = 0; + } +} + int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn, u32 *line_count) { struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr; @@ -484,6 +498,11 @@ int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn, u32 *line_count) p_mngr->pf_start_line = RESC_START(p_hwfn, QED_ILT); + /* Reset all ILT blocks at the beginning of ILT computing in order + * to prevent memory allocation for irrelevant blocks afterwards. + */ + qed_cxt_ilt_blk_reset(p_hwfn); + DP_VERBOSE(p_hwfn, QED_MSG_ILT, "hwfn [%d] - Set context manager starting line to be 0x%08x\n", p_hwfn->my_id, p_hwfn->p_cxt_mngr->pf_start_line); -- cgit v1.2.3 From 10f468ea5c481b3a60cc291c4dfdc7bb338abb74 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 23 Jun 2020 16:51:37 +0300 Subject: net: qed: fix "maybe uninitialized" warning Variable 'abs_ppfid' in qed_dev.c:qed_llh_add_mac_filter() always gets printed, but is initialized only under 'ref_cnt == 1' condition. This results in: In file included from ./include/linux/kernel.h:15:0, from ./include/asm-generic/bug.h:19, from ./arch/x86/include/asm/bug.h:86, from ./include/linux/bug.h:5, from ./include/linux/io.h:11, from drivers/net/ethernet/qlogic/qed/qed_dev.c:35: drivers/net/ethernet/qlogic/qed/qed_dev.c: In function 'qed_llh_add_mac_filter': ./include/linux/printk.h:358:2: warning: 'abs_ppfid' may be used uninitialized in this function [-Wmaybe-uninitialized] printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) ^~~~~~ drivers/net/ethernet/qlogic/qed/qed_dev.c:983:17: note: 'abs_ppfid' was declared here u8 filter_idx, abs_ppfid; ^~~~~~~~~ ...under W=1+. Fix this by initializing it with zero. Fixes: 79284adeb99e ("qed: Add llh ppfid interface and 100g support for offload protocols") Signed-off-by: Alexander Lobakin Signed-off-by: Igor Russkikh Signed-off-by: Michal Kalderon Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index b41ada668948..3aa51374e727 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -980,7 +980,7 @@ int qed_llh_add_mac_filter(struct qed_dev *cdev, struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); struct qed_ptt *p_ptt = qed_ptt_acquire(p_hwfn); union qed_llh_filter filter = {}; - u8 filter_idx, abs_ppfid; + u8 filter_idx, abs_ppfid = 0; u32 high, low, ref_cnt; int rc = 0; -- cgit v1.2.3 From 11d8cd5c9f3b46f397f889cefdb66795518aaebd Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Wed, 24 Jun 2020 01:51:31 +0530 Subject: cxgb4: move handling L2T ARP failures to caller Move code handling L2T ARP failures to the only caller. Fixes following sparse warning: skbuff.h:2091:29: warning: context imbalance in 'handle_failed_resolution' - unexpected unlock Fixes: 749cb5fe48bb ("cxgb4: Replace arpq_head/arpq_tail with SKB double link-list code") Signed-off-by: Rahul Lakkireddy Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/l2t.c | 52 +++++++++++++++----------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/l2t.c b/drivers/net/ethernet/chelsio/cxgb4/l2t.c index 72b37a66c7d8..0ed20a9cca14 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/l2t.c +++ b/drivers/net/ethernet/chelsio/cxgb4/l2t.c @@ -502,41 +502,20 @@ u64 cxgb4_select_ntuple(struct net_device *dev, } EXPORT_SYMBOL(cxgb4_select_ntuple); -/* - * Called when address resolution fails for an L2T entry to handle packets - * on the arpq head. If a packet specifies a failure handler it is invoked, - * otherwise the packet is sent to the device. - */ -static void handle_failed_resolution(struct adapter *adap, struct l2t_entry *e) -{ - struct sk_buff *skb; - - while ((skb = __skb_dequeue(&e->arpq)) != NULL) { - const struct l2t_skb_cb *cb = L2T_SKB_CB(skb); - - spin_unlock(&e->lock); - if (cb->arp_err_handler) - cb->arp_err_handler(cb->handle, skb); - else - t4_ofld_send(adap, skb); - spin_lock(&e->lock); - } -} - /* * Called when the host's neighbor layer makes a change to some entry that is * loaded into the HW L2 table. */ void t4_l2t_update(struct adapter *adap, struct neighbour *neigh) { - struct l2t_entry *e; - struct sk_buff_head *arpq = NULL; - struct l2t_data *d = adap->l2t; unsigned int addr_len = neigh->tbl->key_len; u32 *addr = (u32 *) neigh->primary_key; - int ifidx = neigh->dev->ifindex; - int hash = addr_hash(d, addr, addr_len, ifidx); + int hash, ifidx = neigh->dev->ifindex; + struct sk_buff_head *arpq = NULL; + struct l2t_data *d = adap->l2t; + struct l2t_entry *e; + hash = addr_hash(d, addr, addr_len, ifidx); read_lock_bh(&d->lock); for (e = d->l2tab[hash].first; e; e = e->next) if (!addreq(e, addr) && e->ifindex == ifidx) { @@ -569,8 +548,25 @@ void t4_l2t_update(struct adapter *adap, struct neighbour *neigh) write_l2e(adap, e, 0); } - if (arpq) - handle_failed_resolution(adap, e); + if (arpq) { + struct sk_buff *skb; + + /* Called when address resolution fails for an L2T + * entry to handle packets on the arpq head. If a + * packet specifies a failure handler it is invoked, + * otherwise the packet is sent to the device. + */ + while ((skb = __skb_dequeue(&e->arpq)) != NULL) { + const struct l2t_skb_cb *cb = L2T_SKB_CB(skb); + + spin_unlock(&e->lock); + if (cb->arp_err_handler) + cb->arp_err_handler(cb->handle, skb); + else + t4_ofld_send(adap, skb); + spin_lock(&e->lock); + } + } spin_unlock_bh(&e->lock); } -- cgit v1.2.3 From 030c98824deaba205787af501a074b3d7f46e32e Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Wed, 24 Jun 2020 01:51:32 +0530 Subject: cxgb4: move PTP lock and unlock to caller in Tx path Check for whether PTP is enabled or not at the caller and perform locking/unlocking at the caller. Fixes following sparse warning: sge.c:1641:26: warning: context imbalance in 'cxgb4_eth_xmit' - different lock contexts for basic block Fixes: a456950445a0 ("cxgb4: time stamping interface for PTP") Signed-off-by: Rahul Lakkireddy Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/sge.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index 1359158652b7..f7b5a2f11001 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -1425,12 +1425,10 @@ static netdev_tx_t cxgb4_eth_xmit(struct sk_buff *skb, struct net_device *dev) qidx = skb_get_queue_mapping(skb); if (ptp_enabled) { - spin_lock(&adap->ptp_lock); if (!(adap->ptp_tx_skb)) { skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; adap->ptp_tx_skb = skb_get(skb); } else { - spin_unlock(&adap->ptp_lock); goto out_free; } q = &adap->sge.ptptxq; @@ -1444,11 +1442,8 @@ static netdev_tx_t cxgb4_eth_xmit(struct sk_buff *skb, struct net_device *dev) #ifdef CONFIG_CHELSIO_T4_FCOE ret = cxgb_fcoe_offload(skb, adap, pi, &cntrl); - if (unlikely(ret == -ENOTSUPP)) { - if (ptp_enabled) - spin_unlock(&adap->ptp_lock); + if (unlikely(ret == -EOPNOTSUPP)) goto out_free; - } #endif /* CONFIG_CHELSIO_T4_FCOE */ chip_ver = CHELSIO_CHIP_VERSION(adap->params.chip); @@ -1461,8 +1456,6 @@ static netdev_tx_t cxgb4_eth_xmit(struct sk_buff *skb, struct net_device *dev) dev_err(adap->pdev_dev, "%s: Tx ring %u full while queue awake!\n", dev->name, qidx); - if (ptp_enabled) - spin_unlock(&adap->ptp_lock); return NETDEV_TX_BUSY; } @@ -1481,8 +1474,6 @@ static netdev_tx_t cxgb4_eth_xmit(struct sk_buff *skb, struct net_device *dev) unlikely(cxgb4_map_skb(adap->pdev_dev, skb, sgl_sdesc->addr) < 0)) { memset(sgl_sdesc->addr, 0, sizeof(sgl_sdesc->addr)); q->mapping_err++; - if (ptp_enabled) - spin_unlock(&adap->ptp_lock); goto out_free; } @@ -1630,8 +1621,6 @@ static netdev_tx_t cxgb4_eth_xmit(struct sk_buff *skb, struct net_device *dev) txq_advance(&q->q, ndesc); cxgb4_ring_tx_db(adap, &q->q, ndesc); - if (ptp_enabled) - spin_unlock(&adap->ptp_lock); return NETDEV_TX_OK; out_free: @@ -2377,6 +2366,16 @@ netdev_tx_t t4_start_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(qid >= pi->nqsets)) return cxgb4_ethofld_xmit(skb, dev); + if (is_ptp_enabled(skb, dev)) { + struct adapter *adap = netdev2adap(dev); + netdev_tx_t ret; + + spin_lock(&adap->ptp_lock); + ret = cxgb4_eth_xmit(skb, dev); + spin_unlock(&adap->ptp_lock); + return ret; + } + return cxgb4_eth_xmit(skb, dev); } -- cgit v1.2.3 From 589b1c9c166dce120e27b32a83a78f55464a7ef9 Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Wed, 24 Jun 2020 01:51:33 +0530 Subject: cxgb4: use unaligned conversion for fetching timestamp Use get_unaligned_be64() to fetch the timestamp needed for ns_to_ktime() conversion. Fixes following sparse warning: sge.c:3282:43: warning: cast to restricted __be64 Fixes: a456950445a0 ("cxgb4: time stamping interface for PTP") Signed-off-by: Rahul Lakkireddy Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/sge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index f7b5a2f11001..3c8b4b153ec4 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -3312,7 +3312,7 @@ static noinline int t4_systim_to_hwstamp(struct adapter *adapter, hwtstamps = skb_hwtstamps(skb); memset(hwtstamps, 0, sizeof(*hwtstamps)); - hwtstamps->hwtstamp = ns_to_ktime(be64_to_cpu(*((u64 *)data))); + hwtstamps->hwtstamp = ns_to_ktime(get_unaligned_be64(data)); return RX_PTP_PKT_SUC; } -- cgit v1.2.3 From 27f78cb245abdb86735529c13b0a579f57829e71 Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Wed, 24 Jun 2020 01:51:34 +0530 Subject: cxgb4: parse TC-U32 key values and masks natively TC-U32 passes all keys values and masks in __be32 format. The parser already expects this and hence pass the value and masks in __be32 natively to the parser. Fixes following sparse warnings in several places: cxgb4_tc_u32.c:57:21: warning: incorrect type in assignment (different base types) cxgb4_tc_u32.c:57:21: expected unsigned int [usertype] val cxgb4_tc_u32.c:57:21: got restricted __be32 [usertype] val cxgb4_tc_u32_parse.h:48:24: warning: cast to restricted __be32 Fixes: 2e8aad7bf203 ("cxgb4: add parser to translate u32 filters to internal spec") Signed-off-by: Rahul Lakkireddy Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c | 18 +-- .../ethernet/chelsio/cxgb4/cxgb4_tc_u32_parse.h | 122 ++++++++++++++------- 2 files changed, 91 insertions(+), 49 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c index 3f3c11e54d97..dede02505ceb 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c @@ -48,7 +48,7 @@ static int fill_match_fields(struct adapter *adap, bool next_header) { unsigned int i, j; - u32 val, mask; + __be32 val, mask; int off, err; bool found; @@ -228,7 +228,7 @@ int cxgb4_config_knode(struct net_device *dev, struct tc_cls_u32_offload *cls) const struct cxgb4_next_header *next; bool found = false; unsigned int i, j; - u32 val, mask; + __be32 val, mask; int off; if (t->table[link_uhtid - 1].link_handle) { @@ -242,10 +242,10 @@ int cxgb4_config_knode(struct net_device *dev, struct tc_cls_u32_offload *cls) /* Try to find matches that allow jumps to next header. */ for (i = 0; next[i].jump; i++) { - if (next[i].offoff != cls->knode.sel->offoff || - next[i].shift != cls->knode.sel->offshift || - next[i].mask != cls->knode.sel->offmask || - next[i].offset != cls->knode.sel->off) + if (next[i].sel.offoff != cls->knode.sel->offoff || + next[i].sel.offshift != cls->knode.sel->offshift || + next[i].sel.offmask != cls->knode.sel->offmask || + next[i].sel.off != cls->knode.sel->off) continue; /* Found a possible candidate. Find a key that @@ -257,9 +257,9 @@ int cxgb4_config_knode(struct net_device *dev, struct tc_cls_u32_offload *cls) val = cls->knode.sel->keys[j].val; mask = cls->knode.sel->keys[j].mask; - if (next[i].match_off == off && - next[i].match_val == val && - next[i].match_mask == mask) { + if (next[i].key.off == off && + next[i].key.val == val && + next[i].key.mask == mask) { found = true; break; } diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32_parse.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32_parse.h index 125868c6770a..f59dd4b2ae6f 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32_parse.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32_parse.h @@ -38,12 +38,12 @@ struct cxgb4_match_field { int off; /* Offset from the beginning of the header to match */ /* Fill the value/mask pair in the spec if matched */ - int (*val)(struct ch_filter_specification *f, u32 val, u32 mask); + int (*val)(struct ch_filter_specification *f, __be32 val, __be32 mask); }; /* IPv4 match fields */ static inline int cxgb4_fill_ipv4_tos(struct ch_filter_specification *f, - u32 val, u32 mask) + __be32 val, __be32 mask) { f->val.tos = (ntohl(val) >> 16) & 0x000000FF; f->mask.tos = (ntohl(mask) >> 16) & 0x000000FF; @@ -52,7 +52,7 @@ static inline int cxgb4_fill_ipv4_tos(struct ch_filter_specification *f, } static inline int cxgb4_fill_ipv4_frag(struct ch_filter_specification *f, - u32 val, u32 mask) + __be32 val, __be32 mask) { u32 mask_val; u8 frag_val; @@ -74,7 +74,7 @@ static inline int cxgb4_fill_ipv4_frag(struct ch_filter_specification *f, } static inline int cxgb4_fill_ipv4_proto(struct ch_filter_specification *f, - u32 val, u32 mask) + __be32 val, __be32 mask) { f->val.proto = (ntohl(val) >> 16) & 0x000000FF; f->mask.proto = (ntohl(mask) >> 16) & 0x000000FF; @@ -83,7 +83,7 @@ static inline int cxgb4_fill_ipv4_proto(struct ch_filter_specification *f, } static inline int cxgb4_fill_ipv4_src_ip(struct ch_filter_specification *f, - u32 val, u32 mask) + __be32 val, __be32 mask) { memcpy(&f->val.fip[0], &val, sizeof(u32)); memcpy(&f->mask.fip[0], &mask, sizeof(u32)); @@ -92,7 +92,7 @@ static inline int cxgb4_fill_ipv4_src_ip(struct ch_filter_specification *f, } static inline int cxgb4_fill_ipv4_dst_ip(struct ch_filter_specification *f, - u32 val, u32 mask) + __be32 val, __be32 mask) { memcpy(&f->val.lip[0], &val, sizeof(u32)); memcpy(&f->mask.lip[0], &mask, sizeof(u32)); @@ -111,7 +111,7 @@ static const struct cxgb4_match_field cxgb4_ipv4_fields[] = { /* IPv6 match fields */ static inline int cxgb4_fill_ipv6_tos(struct ch_filter_specification *f, - u32 val, u32 mask) + __be32 val, __be32 mask) { f->val.tos = (ntohl(val) >> 20) & 0x000000FF; f->mask.tos = (ntohl(mask) >> 20) & 0x000000FF; @@ -120,7 +120,7 @@ static inline int cxgb4_fill_ipv6_tos(struct ch_filter_specification *f, } static inline int cxgb4_fill_ipv6_proto(struct ch_filter_specification *f, - u32 val, u32 mask) + __be32 val, __be32 mask) { f->val.proto = (ntohl(val) >> 8) & 0x000000FF; f->mask.proto = (ntohl(mask) >> 8) & 0x000000FF; @@ -129,7 +129,7 @@ static inline int cxgb4_fill_ipv6_proto(struct ch_filter_specification *f, } static inline int cxgb4_fill_ipv6_src_ip0(struct ch_filter_specification *f, - u32 val, u32 mask) + __be32 val, __be32 mask) { memcpy(&f->val.fip[0], &val, sizeof(u32)); memcpy(&f->mask.fip[0], &mask, sizeof(u32)); @@ -138,7 +138,7 @@ static inline int cxgb4_fill_ipv6_src_ip0(struct ch_filter_specification *f, } static inline int cxgb4_fill_ipv6_src_ip1(struct ch_filter_specification *f, - u32 val, u32 mask) + __be32 val, __be32 mask) { memcpy(&f->val.fip[4], &val, sizeof(u32)); memcpy(&f->mask.fip[4], &mask, sizeof(u32)); @@ -147,7 +147,7 @@ static inline int cxgb4_fill_ipv6_src_ip1(struct ch_filter_specification *f, } static inline int cxgb4_fill_ipv6_src_ip2(struct ch_filter_specification *f, - u32 val, u32 mask) + __be32 val, __be32 mask) { memcpy(&f->val.fip[8], &val, sizeof(u32)); memcpy(&f->mask.fip[8], &mask, sizeof(u32)); @@ -156,7 +156,7 @@ static inline int cxgb4_fill_ipv6_src_ip2(struct ch_filter_specification *f, } static inline int cxgb4_fill_ipv6_src_ip3(struct ch_filter_specification *f, - u32 val, u32 mask) + __be32 val, __be32 mask) { memcpy(&f->val.fip[12], &val, sizeof(u32)); memcpy(&f->mask.fip[12], &mask, sizeof(u32)); @@ -165,7 +165,7 @@ static inline int cxgb4_fill_ipv6_src_ip3(struct ch_filter_specification *f, } static inline int cxgb4_fill_ipv6_dst_ip0(struct ch_filter_specification *f, - u32 val, u32 mask) + __be32 val, __be32 mask) { memcpy(&f->val.lip[0], &val, sizeof(u32)); memcpy(&f->mask.lip[0], &mask, sizeof(u32)); @@ -174,7 +174,7 @@ static inline int cxgb4_fill_ipv6_dst_ip0(struct ch_filter_specification *f, } static inline int cxgb4_fill_ipv6_dst_ip1(struct ch_filter_specification *f, - u32 val, u32 mask) + __be32 val, __be32 mask) { memcpy(&f->val.lip[4], &val, sizeof(u32)); memcpy(&f->mask.lip[4], &mask, sizeof(u32)); @@ -183,7 +183,7 @@ static inline int cxgb4_fill_ipv6_dst_ip1(struct ch_filter_specification *f, } static inline int cxgb4_fill_ipv6_dst_ip2(struct ch_filter_specification *f, - u32 val, u32 mask) + __be32 val, __be32 mask) { memcpy(&f->val.lip[8], &val, sizeof(u32)); memcpy(&f->mask.lip[8], &mask, sizeof(u32)); @@ -192,7 +192,7 @@ static inline int cxgb4_fill_ipv6_dst_ip2(struct ch_filter_specification *f, } static inline int cxgb4_fill_ipv6_dst_ip3(struct ch_filter_specification *f, - u32 val, u32 mask) + __be32 val, __be32 mask) { memcpy(&f->val.lip[12], &val, sizeof(u32)); memcpy(&f->mask.lip[12], &mask, sizeof(u32)); @@ -216,7 +216,7 @@ static const struct cxgb4_match_field cxgb4_ipv6_fields[] = { /* TCP/UDP match */ static inline int cxgb4_fill_l4_ports(struct ch_filter_specification *f, - u32 val, u32 mask) + __be32 val, __be32 mask) { f->val.fport = ntohl(val) >> 16; f->mask.fport = ntohl(mask) >> 16; @@ -237,19 +237,13 @@ static const struct cxgb4_match_field cxgb4_udp_fields[] = { }; struct cxgb4_next_header { - unsigned int offset; /* Offset to next header */ - /* offset, shift, and mask added to offset above + /* Offset, shift, and mask added to beginning of the header * to get to next header. Useful when using a header * field's value to jump to next header such as IHL field * in IPv4 header. */ - unsigned int offoff; - u32 shift; - u32 mask; - /* match criteria to make this jump */ - unsigned int match_off; - u32 match_val; - u32 match_mask; + struct tc_u32_sel sel; + struct tc_u32_key key; /* location of jump to make */ const struct cxgb4_match_field *jump; }; @@ -258,26 +252,74 @@ struct cxgb4_next_header { * IPv4 header. */ static const struct cxgb4_next_header cxgb4_ipv4_jumps[] = { - { .offset = 0, .offoff = 0, .shift = 6, .mask = 0xF, - .match_off = 8, .match_val = 0x600, .match_mask = 0xFF00, - .jump = cxgb4_tcp_fields }, - { .offset = 0, .offoff = 0, .shift = 6, .mask = 0xF, - .match_off = 8, .match_val = 0x1100, .match_mask = 0xFF00, - .jump = cxgb4_udp_fields }, - { .jump = NULL } + { + /* TCP Jump */ + .sel = { + .off = 0, + .offoff = 0, + .offshift = 6, + .offmask = cpu_to_be16(0x0f00), + }, + .key = { + .off = 8, + .val = cpu_to_be32(0x00060000), + .mask = cpu_to_be32(0x00ff0000), + }, + .jump = cxgb4_tcp_fields, + }, + { + /* UDP Jump */ + .sel = { + .off = 0, + .offoff = 0, + .offshift = 6, + .offmask = cpu_to_be16(0x0f00), + }, + .key = { + .off = 8, + .val = cpu_to_be32(0x00110000), + .mask = cpu_to_be32(0x00ff0000), + }, + .jump = cxgb4_udp_fields, + }, + { .jump = NULL }, }; /* Accept a rule with a jump directly past the 40 Bytes of IPv6 fixed header * to get to transport layer header. */ static const struct cxgb4_next_header cxgb4_ipv6_jumps[] = { - { .offset = 0x28, .offoff = 0, .shift = 0, .mask = 0, - .match_off = 4, .match_val = 0x60000, .match_mask = 0xFF0000, - .jump = cxgb4_tcp_fields }, - { .offset = 0x28, .offoff = 0, .shift = 0, .mask = 0, - .match_off = 4, .match_val = 0x110000, .match_mask = 0xFF0000, - .jump = cxgb4_udp_fields }, - { .jump = NULL } + { + /* TCP Jump */ + .sel = { + .off = 40, + .offoff = 0, + .offshift = 0, + .offmask = 0, + }, + .key = { + .off = 4, + .val = cpu_to_be32(0x00000600), + .mask = cpu_to_be32(0x0000ff00), + }, + .jump = cxgb4_tcp_fields, + }, + { + /* UDP Jump */ + .sel = { + .off = 40, + .offoff = 0, + .offshift = 0, + .offmask = 0, + }, + .key = { + .off = 4, + .val = cpu_to_be32(0x00001100), + .mask = cpu_to_be32(0x0000ff00), + }, + .jump = cxgb4_udp_fields, + }, + { .jump = NULL }, }; struct cxgb4_link { -- cgit v1.2.3 From 63b53b0b99cd5f2d9754a21eda2ed8e706646cc9 Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Wed, 24 Jun 2020 01:51:35 +0530 Subject: cxgb4: fix endian conversions for L4 ports in filters The source and destination L4 ports in filter offload need to be in CPU endian. They will finally be converted to Big Endian after all operations are done and before giving them to hardware. The L4 ports for NAT are expected to be passed as a byte stream TCB. So, treat them as such. Fixes following sparse warnings in several places: cxgb4_tc_flower.c:159:33: warning: cast from restricted __be16 cxgb4_tc_flower.c:159:33: warning: incorrect type in argument 1 (different base types) cxgb4_tc_flower.c:159:33: expected unsigned short [usertype] val cxgb4_tc_flower.c:159:33: got restricted __be16 [usertype] dst Fixes: dca4faeb812f ("cxgb4: Add LE hash collision bug fix path in LLD driver") Fixes: 62488e4b53ae ("cxgb4: add basic tc flower offload support") Fixes: 557ccbf9dfa8 ("cxgb4: add tc flower support for L3/L4 rewrite") Signed-off-by: Rahul Lakkireddy Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c | 15 ++++++++--- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 2 +- .../net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c | 30 ++++++++-------------- 3 files changed, 22 insertions(+), 25 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c index 796555255207..c6bf2648fe42 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c @@ -165,6 +165,9 @@ static void set_nat_params(struct adapter *adap, struct filter_entry *f, unsigned int tid, bool dip, bool sip, bool dp, bool sp) { + u8 *nat_lp = (u8 *)&f->fs.nat_lport; + u8 *nat_fp = (u8 *)&f->fs.nat_fport; + if (dip) { if (f->fs.type) { set_tcb_field(adap, f, tid, TCB_SND_UNA_RAW_W, @@ -236,8 +239,9 @@ static void set_nat_params(struct adapter *adap, struct filter_entry *f, } set_tcb_field(adap, f, tid, TCB_PDU_HDR_LEN_W, WORD_MASK, - (dp ? f->fs.nat_lport : 0) | - (sp ? f->fs.nat_fport << 16 : 0), 1); + (dp ? (nat_lp[1] | nat_lp[0] << 8) : 0) | + (sp ? (nat_fp[1] << 16 | nat_fp[0] << 24) : 0), + 1); } /* Validate filter spec against configuration done on the card. */ @@ -909,6 +913,9 @@ int set_filter_wr(struct adapter *adapter, int fidx) fwr->fpm = htons(f->fs.mask.fport); if (adapter->params.filter2_wr_support) { + u8 *nat_lp = (u8 *)&f->fs.nat_lport; + u8 *nat_fp = (u8 *)&f->fs.nat_fport; + fwr->natmode_to_ulp_type = FW_FILTER2_WR_ULP_TYPE_V(f->fs.nat_mode ? ULP_MODE_TCPDDP : @@ -916,8 +923,8 @@ int set_filter_wr(struct adapter *adapter, int fidx) FW_FILTER2_WR_NATMODE_V(f->fs.nat_mode); memcpy(fwr->newlip, f->fs.nat_lip, sizeof(fwr->newlip)); memcpy(fwr->newfip, f->fs.nat_fip, sizeof(fwr->newfip)); - fwr->newlport = htons(f->fs.nat_lport); - fwr->newfport = htons(f->fs.nat_fport); + fwr->newlport = htons(nat_lp[1] | nat_lp[0] << 8); + fwr->newfport = htons(nat_fp[1] | nat_fp[0] << 8); } /* Mark the filter as "pending" and ship off the Filter Work Request. diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 854b1717a70d..1e66159de079 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -2609,7 +2609,7 @@ int cxgb4_create_server_filter(const struct net_device *dev, unsigned int stid, /* Clear out filter specifications */ memset(&f->fs, 0, sizeof(struct ch_filter_specification)); - f->fs.val.lport = cpu_to_be16(sport); + f->fs.val.lport = be16_to_cpu(sport); f->fs.mask.lport = ~0; val = (u8 *)&sip; if ((val[0] | val[1] | val[2] | val[3]) != 0) { diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c index 4a5fa9eba0b6..59b65d4db086 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c @@ -58,10 +58,6 @@ static struct ch_tc_pedit_fields pedits[] = { PEDIT_FIELDS(IP6_, DST_63_32, 4, nat_lip, 4), PEDIT_FIELDS(IP6_, DST_95_64, 4, nat_lip, 8), PEDIT_FIELDS(IP6_, DST_127_96, 4, nat_lip, 12), - PEDIT_FIELDS(TCP_, SPORT, 2, nat_fport, 0), - PEDIT_FIELDS(TCP_, DPORT, 2, nat_lport, 0), - PEDIT_FIELDS(UDP_, SPORT, 2, nat_fport, 0), - PEDIT_FIELDS(UDP_, DPORT, 2, nat_lport, 0), }; static struct ch_tc_flower_entry *allocate_flower_entry(void) @@ -156,14 +152,14 @@ static void cxgb4_process_flow_match(struct net_device *dev, struct flow_match_ports match; flow_rule_match_ports(rule, &match); - fs->val.lport = cpu_to_be16(match.key->dst); - fs->mask.lport = cpu_to_be16(match.mask->dst); - fs->val.fport = cpu_to_be16(match.key->src); - fs->mask.fport = cpu_to_be16(match.mask->src); + fs->val.lport = be16_to_cpu(match.key->dst); + fs->mask.lport = be16_to_cpu(match.mask->dst); + fs->val.fport = be16_to_cpu(match.key->src); + fs->mask.fport = be16_to_cpu(match.mask->src); /* also initialize nat_lport/fport to same values */ - fs->nat_lport = cpu_to_be16(match.key->dst); - fs->nat_fport = cpu_to_be16(match.key->src); + fs->nat_lport = fs->val.lport; + fs->nat_fport = fs->val.fport; } if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_IP)) { @@ -354,12 +350,9 @@ static void process_pedit_field(struct ch_filter_specification *fs, u32 val, switch (offset) { case PEDIT_TCP_SPORT_DPORT: if (~mask & PEDIT_TCP_UDP_SPORT_MASK) - offload_pedit(fs, cpu_to_be32(val) >> 16, - cpu_to_be32(mask) >> 16, - TCP_SPORT); + fs->nat_fport = val; else - offload_pedit(fs, cpu_to_be32(val), - cpu_to_be32(mask), TCP_DPORT); + fs->nat_lport = val >> 16; } fs->nat_mode = NAT_MODE_ALL; break; @@ -367,12 +360,9 @@ static void process_pedit_field(struct ch_filter_specification *fs, u32 val, switch (offset) { case PEDIT_UDP_SPORT_DPORT: if (~mask & PEDIT_TCP_UDP_SPORT_MASK) - offload_pedit(fs, cpu_to_be32(val) >> 16, - cpu_to_be32(mask) >> 16, - UDP_SPORT); + fs->nat_fport = val; else - offload_pedit(fs, cpu_to_be32(val), - cpu_to_be32(mask), UDP_DPORT); + fs->nat_lport = val >> 16; } fs->nat_mode = NAT_MODE_ALL; } -- cgit v1.2.3 From f286dd8eaad5a2758750f407ab079298e0bcc8a5 Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Wed, 24 Jun 2020 01:51:36 +0530 Subject: cxgb4: use correct type for all-mask IP address comparison Use correct type to check for all-mask exact match IP addresses. Fixes following sparse warnings due to big endian value checks against 0xffffffff in is_addr_all_mask(): cxgb4_filter.c:977:25: warning: restricted __be32 degrades to integer cxgb4_filter.c:983:37: warning: restricted __be32 degrades to integer cxgb4_filter.c:984:37: warning: restricted __be32 degrades to integer cxgb4_filter.c:985:37: warning: restricted __be32 degrades to integer cxgb4_filter.c:986:37: warning: restricted __be32 degrades to integer Fixes: 3eb8b62d5a26 ("cxgb4: add support to create hash-filters via tc-flower offload") Signed-off-by: Rahul Lakkireddy Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c index c6bf2648fe42..7a7f61a8cdf4 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c @@ -1112,16 +1112,16 @@ static bool is_addr_all_mask(u8 *ipmask, int family) struct in_addr *addr; addr = (struct in_addr *)ipmask; - if (addr->s_addr == 0xffffffff) + if (ntohl(addr->s_addr) == 0xffffffff) return true; } else if (family == AF_INET6) { struct in6_addr *addr6; addr6 = (struct in6_addr *)ipmask; - if (addr6->s6_addr32[0] == 0xffffffff && - addr6->s6_addr32[1] == 0xffffffff && - addr6->s6_addr32[2] == 0xffffffff && - addr6->s6_addr32[3] == 0xffffffff) + if (ntohl(addr6->s6_addr32[0]) == 0xffffffff && + ntohl(addr6->s6_addr32[1]) == 0xffffffff && + ntohl(addr6->s6_addr32[2]) == 0xffffffff && + ntohl(addr6->s6_addr32[3]) == 0xffffffff) return true; } return false; -- cgit v1.2.3 From 1992ded5d111997877a9a25205976d8d03c46814 Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Wed, 24 Jun 2020 01:51:37 +0530 Subject: cxgb4: fix SGE queue dump destination buffer context The data in destination buffer is expected to be be parsed in big endian. So, use the right context. Fixes following sparse warning: cudbg_lib.c:2041:44: warning: incorrect type in assignment (different base types) cudbg_lib.c:2041:44: expected unsigned long long [usertype] cudbg_lib.c:2041:44: got restricted __be64 [usertype] Fixes: 736c3b94474e ("cxgb4: collect egress and ingress SGE queue contexts") Signed-off-by: Rahul Lakkireddy Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c index 7b9cd69f9844..d8ab8e366818 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c @@ -1975,7 +1975,6 @@ int cudbg_collect_dump_context(struct cudbg_init *pdbg_init, u8 mem_type[CTXT_INGRESS + 1] = { 0 }; struct cudbg_buffer temp_buff = { 0 }; struct cudbg_ch_cntxt *buff; - u64 *dst_off, *src_off; u8 *ctx_buf; u8 i, k; int rc; @@ -2044,8 +2043,11 @@ int cudbg_collect_dump_context(struct cudbg_init *pdbg_init, } for (j = 0; j < max_ctx_qid; j++) { + __be64 *dst_off; + u64 *src_off; + src_off = (u64 *)(ctx_buf + j * SGE_CTXT_SIZE); - dst_off = (u64 *)buff->data; + dst_off = (__be64 *)buff->data; /* The data is stored in 64-bit cpu order. Convert it * to big endian before parsing. -- cgit v1.2.3 From 2f6670165d22406467c667e9454e558bc75b933e Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Wed, 24 Jun 2020 01:51:38 +0530 Subject: cxgb4: remove cast when saving IPv4 partial checksum The checksum field in IPv4 header is in __sum16 and ip_fast_csum() also returns __sum16. So, no need to cast it to u16. Fixes following sparse warning: sge.c:1539:47: warning: cast from restricted __sum16 sge.c:1539:44: warning: incorrect type in assignment (different base types) sge.c:1539:44: expected restricted __sum16 [usertype] check sge.c:1539:44: got unsigned short [usertype] Fixes: d0a1299c6bf7 ("cxgb4: add support for vxlan segmentation offload") Signed-off-by: Rahul Lakkireddy Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/sge.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index 3c8b4b153ec4..72ff46b16704 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -1524,8 +1524,7 @@ static netdev_tx_t cxgb4_eth_xmit(struct sk_buff *skb, struct net_device *dev) if (iph->version == 4) { iph->check = 0; iph->tot_len = 0; - iph->check = (u16)(~ip_fast_csum((u8 *)iph, - iph->ihl)); + iph->check = ~ip_fast_csum((u8 *)iph, iph->ihl); } if (skb->ip_summed == CHECKSUM_PARTIAL) cntrl = hwcsum(adap->params.chip, skb); -- cgit v1.2.3 From bab3bcf3e9873e1e6e9cb39c1f55a05fb10415a4 Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Wed, 24 Jun 2020 01:51:39 +0530 Subject: cxgb4: move DCB version extern to header file Move the DCB version string array extern to header file. Fixes following sparse warning: cxgb4_dcb.c:13:12: warning: symbol 'dcb_ver_array' was not declared. Should it be static? Fixes: ebddd97afb89 ("cxgb4: add support to display DCB info") Signed-off-by: Rahul Lakkireddy Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h | 3 +++ drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h index d3c654b9989b..80c6627fe981 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h @@ -136,6 +136,9 @@ static inline __u8 bitswap_1(unsigned char val) ((val & 0x02) << 5) | ((val & 0x01) << 7); } + +extern const char * const dcb_ver_array[]; + #define CXGB4_DCB_ENABLED true #else /* !CONFIG_CHELSIO_T4_DCB */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c index 828499256004..b477b8842905 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c @@ -2379,7 +2379,6 @@ static const struct file_operations rss_vf_config_debugfs_fops = { }; #ifdef CONFIG_CHELSIO_T4_DCB -extern char *dcb_ver_array[]; /* Data Center Briging information for each port. */ -- cgit v1.2.3 From 00e31cfc8995de2a15352522fc672922a23b435e Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Wed, 24 Jun 2020 01:51:40 +0530 Subject: cxgb4: fix set but unused variable when DCB is disabled Remove the set but unused variable when DCB is disabled. Instead, do the calculation directly inline. Fixes following warning in make W=1: cxgb4_main.c: In function 'cfg_queues': cxgb4_main.c:5380:29: warning: variable 'n1g' set but not used [-Wunused-but-set-variable] u32 i, n10g = 0, qidx = 0, n1g = 0; ^ Fixes: 116ca924aea6 ("cxgb4: fix checks for max queues to allocate") Signed-off-by: Rahul Lakkireddy Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 1e66159de079..472e7c9e47bd 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -5377,10 +5377,10 @@ static inline bool is_x_10g_port(const struct link_config *lc) static int cfg_queues(struct adapter *adap) { u32 avail_qsets, avail_eth_qsets, avail_uld_qsets; - u32 i, n10g = 0, qidx = 0, n1g = 0; u32 ncpus = num_online_cpus(); u32 niqflint, neq, num_ulds; struct sge *s = &adap->sge; + u32 i, n10g = 0, qidx = 0; u32 q10g = 0, q1g; /* Reduce memory usage in kdump environment, disable all offload. */ @@ -5426,7 +5426,6 @@ static int cfg_queues(struct adapter *adap) if (n10g) q10g = (avail_eth_qsets - (adap->params.nports - n10g)) / n10g; - n1g = adap->params.nports - n10g; #ifdef CONFIG_CHELSIO_T4_DCB /* For Data Center Bridging support we need to be able to support up * to 8 Traffic Priorities; each of which will be assigned to its @@ -5444,7 +5443,8 @@ static int cfg_queues(struct adapter *adap) else q10g = max(8U, q10g); - while ((q10g * n10g) > (avail_eth_qsets - n1g * q1g)) + while ((q10g * n10g) > + (avail_eth_qsets - (adap->params.nports - n10g) * q1g)) q10g--; #else /* !CONFIG_CHELSIO_T4_DCB */ -- cgit v1.2.3 From 29bbf5d7f5efe84f94bc66c6c24614f812a95f62 Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Wed, 24 Jun 2020 01:51:41 +0530 Subject: cxgb4: update kernel-doc line comments Update several kernel-doc line comments to fix warnings reported by make W=1. Fixes following class of warnings reported by make W=1 in several places: l2t.c:616: warning: Cannot understand * @dev: net_device pointer t4_hw.c:3175: warning: Function parameter or member 'adap' not described in 't4_get_exprom_version' t4_hw.c:3175: warning: Excess function parameter 'adapter' description in 't4_get_exprom_version' Fixes: 56d36be4dd5f ("cxgb4: Add HW and FW support code") Fixes: fd3a47900b6f ("cxgb4: Add packet queues and packet DMA code") Fixes: 26f7cbc0a5a4 ("cxgb4: Don't attempt to upgrade T4 firmware when cxgb4 will end up as a slave") Fixes: 793dad94e745 ("RDMA/cxgb4: Fix bug for active and passive LE hash collision path") Fixes: ba3f8cd55f2a ("cxgb4: Add support in cxgb4 to get expansion rom version via ethtool") Fixes: f7502659cec8 ("cxgb4: Add API to alloc l2t entry; also update existing ones") Fixes: ddc7740d9a7c ("cxgb4: Decode link down reason code obtained from firmware") Fixes: 193c4c2845f7 ("cxgb4: Update T6 Buffer Group and Channel Mappings") Fixes: 8f46d46715a1 ("cxgb4: Use Firmware params to get buffer-group map") Fixes: a456950445a0 ("cxgb4: time stamping interface for PTP") Fixes: 9c33e4208bce ("cxgb4: Add PTP Hardware Clock (PHC) support") Fixes: c3168cabe1af ("cxgb4/cxgbvf: Handle 32-bit fw port capabilities") Fixes: 5ccf9d049615 ("cxgb4: update API for TP indirect register access") Fixes: 3bdb376e6944 ("cxgb4: introduce SMT ops to prepare for SMAC rewrite support") Fixes: 736c3b94474e ("cxgb4: collect egress and ingress SGE queue contexts") Fixes: f56ec6766dcf ("cxgb4: Add support for ethtool i2c dump") Fixes: 9d5fd927d20b ("cxgb4/cxgb4vf: add support for ndo_set_vf_vlan") Fixes: 98f3697f8d41 ("cxgb4: add tc flower match support for tunnel VNI") Fixes: 02d805dc5fe3 ("cxgb4: use new fw interface to get the VIN and smt index") Fixes: 3f8cfd0d95e6 ("cxgb4/cxgb4vf: Program hash region for {t4/t4vf}_change_mac()") Fixes: d429005fdf2c ("cxgb4/cxgb4vf: Add support for SGE doorbell queue timer") Fixes: 0e395b3cb1fb ("cxgb4: add FLOWC based QoS offload") Signed-off-by: Rahul Lakkireddy Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c | 2 +- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 3 +- drivers/net/ethernet/chelsio/cxgb4/cxgb4_ptp.c | 3 +- drivers/net/ethernet/chelsio/cxgb4/l2t.c | 1 + drivers/net/ethernet/chelsio/cxgb4/sched.c | 2 +- drivers/net/ethernet/chelsio/cxgb4/sge.c | 19 +++++++----- drivers/net/ethernet/chelsio/cxgb4/smt.c | 2 ++ drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 36 +++++++++++----------- 8 files changed, 38 insertions(+), 30 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c index 9fd496732b2c..f27be1132d37 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c @@ -588,7 +588,7 @@ static void fw_caps_to_lmm(enum fw_port_type port_type, /** * lmm_to_fw_caps - translate ethtool Link Mode Mask to Firmware * capabilities - * @et_lmm: ethtool Link Mode Mask + * @link_mode_mask: ethtool Link Mode Mask * * Translate ethtool Link Mode Mask into a Firmware Port capabilities * value. diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 472e7c9e47bd..0329a6b52087 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -449,7 +449,7 @@ static int set_rxmode(struct net_device *dev, int mtu, bool sleep_ok) * or -1 * @addr: the new MAC address value * @persist: whether a new MAC allocation should be persistent - * @add_smt: if true also add the address to the HW SMT + * @smt_idx: the destination to store the new SMT index. * * Modifies an MPS filter and sets it to the new MAC address if * @tcam_idx >= 0, or adds the MAC address to a new filter if @@ -1615,6 +1615,7 @@ static int tid_init(struct tid_info *t) * @stid: the server TID * @sip: local IP address to bind server to * @sport: the server's TCP port + * @vlan: the VLAN header information * @queue: queue to direct messages from this server to * * Create an IP server for the given port and address. diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ptp.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ptp.c index f5bc996ac77d..70dbee89118e 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ptp.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ptp.c @@ -194,6 +194,7 @@ int cxgb4_ptp_redirect_rx_packet(struct adapter *adapter, struct port_info *pi) } /** + * cxgb4_ptp_adjfreq - Adjust frequency of PHC cycle counter * @ptp: ptp clock structure * @ppb: Desired frequency change in parts per billion * @@ -229,7 +230,7 @@ static int cxgb4_ptp_adjfreq(struct ptp_clock_info *ptp, s32 ppb) /** * cxgb4_ptp_fineadjtime - Shift the time of the hardware clock - * @ptp: ptp clock structure + * @adapter: board private structure * @delta: Desired change in nanoseconds * * Adjust the timer by resetting the timecounter structure. diff --git a/drivers/net/ethernet/chelsio/cxgb4/l2t.c b/drivers/net/ethernet/chelsio/cxgb4/l2t.c index 0ed20a9cca14..c4864125fe02 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/l2t.c +++ b/drivers/net/ethernet/chelsio/cxgb4/l2t.c @@ -609,6 +609,7 @@ struct l2t_entry *t4_l2t_alloc_switching(struct adapter *adap, u16 vlan, } /** + * cxgb4_l2t_alloc_switching - Allocates an L2T entry for switch filters * @dev: net_device pointer * @vlan: VLAN Id * @port: Associated port diff --git a/drivers/net/ethernet/chelsio/cxgb4/sched.c b/drivers/net/ethernet/chelsio/cxgb4/sched.c index fde93c50cfec..a1b14468d1ff 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sched.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sched.c @@ -598,7 +598,7 @@ struct sched_class *cxgb4_sched_class_alloc(struct net_device *dev, /** * cxgb4_sched_class_free - free a scheduling class * @dev: net_device pointer - * @e: scheduling class + * @classid: scheduling class id to free * * Frees a scheduling class if there are no users. */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index 72ff46b16704..32a45dc51ed7 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -302,7 +302,7 @@ static void deferred_unmap_destructor(struct sk_buff *skb) /** * free_tx_desc - reclaims Tx descriptors and their buffers - * @adapter: the adapter + * @adap: the adapter * @q: the Tx queue to reclaim descriptors from * @n: the number of descriptors to reclaim * @unmap: whether the buffers should be unmapped for DMA @@ -722,6 +722,7 @@ static inline unsigned int flits_to_desc(unsigned int n) /** * is_eth_imm - can an Ethernet packet be sent as immediate data? * @skb: the packet + * @chip_ver: chip version * * Returns whether an Ethernet packet is small enough to fit as * immediate data. Return value corresponds to headroom required. @@ -749,6 +750,7 @@ static inline int is_eth_imm(const struct sk_buff *skb, unsigned int chip_ver) /** * calc_tx_flits - calculate the number of flits for a packet Tx WR * @skb: the packet + * @chip_ver: chip version * * Returns the number of flits needed for a Tx WR for the given Ethernet * packet, including the needed WR and CPL headers. @@ -804,6 +806,7 @@ static inline unsigned int calc_tx_flits(const struct sk_buff *skb, /** * calc_tx_descs - calculate the number of Tx descriptors for a packet * @skb: the packet + * @chip_ver: chip version * * Returns the number of Tx descriptors needed for the given Ethernet * packet, including the needed WR and CPL headers. @@ -2408,9 +2411,9 @@ static void eosw_txq_flush_pending_skbs(struct sge_eosw_txq *eosw_txq) /** * cxgb4_ethofld_send_flowc - Send ETHOFLD flowc request to bind eotid to tc. - * @dev - netdevice - * @eotid - ETHOFLD tid to bind/unbind - * @tc - traffic class. If set to FW_SCHED_CLS_NONE, then unbinds the @eotid + * @dev: netdevice + * @eotid: ETHOFLD tid to bind/unbind + * @tc: traffic class. If set to FW_SCHED_CLS_NONE, then unbinds the @eotid * * Send a FLOWC work request to bind an ETHOFLD TID to a traffic class. * If @tc is set to FW_SCHED_CLS_NONE, then the @eotid is unbound from @@ -2689,7 +2692,6 @@ static inline unsigned int calc_tx_flits_ofld(const struct sk_buff *skb) /** * txq_stop_maperr - stop a Tx queue due to I/O MMU exhaustion - * @adap: the adapter * @q: the queue to stop * * Mark a Tx queue stopped due to I/O MMU exhaustion and resulting @@ -3284,7 +3286,7 @@ enum { /** * t4_systim_to_hwstamp - read hardware time stamp - * @adap: the adapter + * @adapter: the adapter * @skb: the packet * * Read Time Stamp from MPS packet and insert in skb which @@ -3318,8 +3320,9 @@ static noinline int t4_systim_to_hwstamp(struct adapter *adapter, /** * t4_rx_hststamp - Recv PTP Event Message - * @adap: the adapter + * @adapter: the adapter * @rsp: the response queue descriptor holding the RX_PKT message + * @rxq: the response queue holding the RX_PKT message * @skb: the packet * * PTP enabled and MPS packet, read HW timestamp @@ -3343,7 +3346,7 @@ static int t4_rx_hststamp(struct adapter *adapter, const __be64 *rsp, /** * t4_tx_hststamp - Loopback PTP Transmit Event Message - * @adap: the adapter + * @adapter: the adapter * @skb: the packet * @dev: the ingress net device * diff --git a/drivers/net/ethernet/chelsio/cxgb4/smt.c b/drivers/net/ethernet/chelsio/cxgb4/smt.c index 01c65d13fc0e..cbe72ed27b1e 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/smt.c +++ b/drivers/net/ethernet/chelsio/cxgb4/smt.c @@ -103,6 +103,7 @@ static void t4_smte_free(struct smt_entry *e) } /** + * cxgb4_smt_release - Release SMT entry * @e: smt entry to release * * Releases ref count and frees up an smt entry from SMT table @@ -231,6 +232,7 @@ static struct smt_entry *t4_smt_alloc_switching(struct adapter *adap, u16 pfvf, } /** + * cxgb4_smt_alloc_switching - Allocates an SMT entry for switch filters. * @dev: net_device pointer * @smac: MAC address to add to SMT * Returns pointer to the SMT entry created diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c index 1c8068c02728..1aa6dc10dc0b 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -3163,7 +3163,7 @@ int t4_get_tp_version(struct adapter *adapter, u32 *vers) /** * t4_get_exprom_version - return the Expansion ROM version (if any) - * @adapter: the adapter + * @adap: the adapter * @vers: where to place the version * * Reads the Expansion ROM header from FLASH and returns the version @@ -5310,7 +5310,7 @@ static unsigned int t4_use_ldst(struct adapter *adap) * @cmd: TP fw ldst address space type * @vals: where the indirect register values are stored/written * @nregs: how many indirect registers to read/write - * @start_idx: index of first indirect register to read/write + * @start_index: index of first indirect register to read/write * @rw: Read (1) or Write (0) * @sleep_ok: if true we may sleep while awaiting command completion * @@ -6115,7 +6115,7 @@ void t4_pmrx_get_stats(struct adapter *adap, u32 cnt[], u64 cycles[]) /** * compute_mps_bg_map - compute the MPS Buffer Group Map for a Port - * @adap: the adapter + * @adapter: the adapter * @pidx: the port index * * Computes and returns a bitmap indicating which MPS buffer groups are @@ -6252,7 +6252,7 @@ static unsigned int t4_get_tp_e2c_map(struct adapter *adapter, int pidx) /** * t4_get_tp_ch_map - return TP ingress channels associated with a port - * @adapter: the adapter + * @adap: the adapter * @pidx: the port index * * Returns a bitmap indicating which TP Ingress Channels are associated @@ -6589,7 +6589,7 @@ int t4_mdio_rd(struct adapter *adap, unsigned int mbox, unsigned int phy_addr, * @phy_addr: the PHY address * @mmd: the PHY MMD to access (0 for clause 22 PHYs) * @reg: the register to write - * @valp: value to write + * @val: value to write * * Issues a FW command through the given mailbox to write a PHY register. */ @@ -6615,7 +6615,7 @@ int t4_mdio_wr(struct adapter *adap, unsigned int mbox, unsigned int phy_addr, /** * t4_sge_decode_idma_state - decode the idma state - * @adap: the adapter + * @adapter: the adapter * @state: the state idma is stuck in */ void t4_sge_decode_idma_state(struct adapter *adapter, int state) @@ -6782,7 +6782,7 @@ void t4_sge_decode_idma_state(struct adapter *adapter, int state) * t4_sge_ctxt_flush - flush the SGE context cache * @adap: the adapter * @mbox: mailbox to use for the FW command - * @ctx_type: Egress or Ingress + * @ctxt_type: Egress or Ingress * * Issues a FW command through the given mailbox to flush the * SGE context cache. @@ -6809,7 +6809,7 @@ int t4_sge_ctxt_flush(struct adapter *adap, unsigned int mbox, int ctxt_type) /** * t4_read_sge_dbqtimers - read SGE Doorbell Queue Timer values - * @adap - the adapter + * @adap: the adapter * @ndbqtimers: size of the provided SGE Doorbell Queue Timer table * @dbqtimers: SGE Doorbell Queue Timer table * @@ -7092,6 +7092,7 @@ static int t4_fw_halt(struct adapter *adap, unsigned int mbox, int force) /** * t4_fw_restart - restart the firmware by taking the uP out of RESET * @adap: the adapter + * @mbox: mailbox to use for the FW command * @reset: if we want to do a RESET to restart things * * Restart firmware previously halted by t4_fw_halt(). On successful @@ -7630,6 +7631,8 @@ int t4_cfg_pfvf(struct adapter *adap, unsigned int mbox, unsigned int pf, * @nmac: number of MAC addresses needed (1 to 5) * @mac: the MAC addresses of the VI * @rss_size: size of RSS table slice associated with this VI + * @vivld: the destination to store the VI Valid value. + * @vin: the destination to store the VIN value. * * Allocates a virtual interface for the given physical port. If @mac is * not %NULL it contains the MAC addresses of the VI as assigned by FW. @@ -7848,7 +7851,7 @@ int t4_free_raw_mac_filt(struct adapter *adap, unsigned int viid, * t4_alloc_encap_mac_filt - Adds a mac entry in mps tcam with VNI support * @adap: the adapter * @viid: the VI id - * @mac: the MAC address + * @addr: the MAC address * @mask: the mask * @vni: the VNI id for the tunnel protocol * @vni_mask: mask for the VNI id @@ -7897,11 +7900,11 @@ int t4_alloc_encap_mac_filt(struct adapter *adap, unsigned int viid, * t4_alloc_raw_mac_filt - Adds a mac entry in mps tcam * @adap: the adapter * @viid: the VI id - * @mac: the MAC address + * @addr: the MAC address * @mask: the mask * @idx: index at which to add this entry - * @port_id: the port index * @lookup_type: MAC address for inner (1) or outer (0) header + * @port_id: the port index * @sleep_ok: call is allowed to sleep * * Adds the mac entry at the specified index using raw mac interface. @@ -8126,7 +8129,7 @@ int t4_free_mac_filt(struct adapter *adap, unsigned int mbox, * @idx: index of existing filter for old value of MAC address, or -1 * @addr: the new MAC address value * @persist: whether a new MAC allocation should be persistent - * @add_smt: if true also add the address to the HW SMT + * @smt_idx: the destination to store the new SMT index. * * Modifies an exact-match filter and sets it to the new MAC address. * Note that in general it is not possible to modify the value of a given @@ -8448,7 +8451,6 @@ int t4_ofld_eq_free(struct adapter *adap, unsigned int mbox, unsigned int pf, /** * t4_link_down_rc_str - return a string for a Link Down Reason Code - * @adap: the adapter * @link_down_rc: Link Down Reason Code * * Returns a string representation of the Link Down Reason Code. @@ -8472,9 +8474,7 @@ static const char *t4_link_down_rc_str(unsigned char link_down_rc) return reason[link_down_rc]; } -/** - * Return the highest speed set in the port capabilities, in Mb/s. - */ +/* Return the highest speed set in the port capabilities, in Mb/s. */ static unsigned int fwcap_to_speed(fw_port_cap32_t caps) { #define TEST_SPEED_RETURN(__caps_speed, __speed) \ @@ -9110,7 +9110,6 @@ found: /** * t4_prep_adapter - prepare SW and HW for operation * @adapter: the adapter - * @reset: if true perform a HW reset * * Initialize adapter SW state for the various HW modules, set initial * values for some adapter tunables, take PHYs out of reset, and @@ -10395,6 +10394,7 @@ int t4_sched_params(struct adapter *adapter, u8 type, u8 level, u8 mode, /** * t4_i2c_rd - read I2C data from adapter * @adap: the adapter + * @mbox: mailbox to use for the FW command * @port: Port number if per-port device; <0 if not * @devid: per-port device ID or absolute device ID * @offset: byte offset into device I2C space @@ -10450,7 +10450,7 @@ int t4_i2c_rd(struct adapter *adap, unsigned int mbox, int port, /** * t4_set_vlan_acl - Set a VLAN id for the specified VF - * @adapter: the adapter + * @adap: the adapter * @mbox: mailbox to use for the FW command * @vf: one of the VFs instantiated by the specified PF * @vlan: The vlanid to be set -- cgit v1.2.3 From 20bb0c8f2c446c55f5a4e296beaa77d62ffe2d1e Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Wed, 24 Jun 2020 01:51:42 +0530 Subject: cxgb4vf: update kernel-doc line comments Update several kernel-doc line comments to fix warnings reported by make W=1. Fixes following class of warnings reported by make W=1 in several places: cxgb4vf_main.c:275: warning: Function parameter or member 'persistent' not described in 'cxgb4vf_change_mac' cxgb4vf_main.c:275: warning: Excess function parameter 'persist' description in 'cxgb4vf_change_mac' Fixes: 16f8bd4be754 ("cxgb4vf: Add core T4 PCI-E SR-IOV Virtual Function hardware definitions and device communication code") Fixes: c6e0d91464da ("cxgb4vf: Add T4 Virtual Function Scatter-Gather Engine DMA code") Fixes: e0a8b34a9cc4 ("cxgb4vf: Add and initialize some sge params for VF driver") Fixes: c3168cabe1af ("cxgb4/cxgbvf: Handle 32-bit fw port capabilities") Fixes: 0e23daeb6407 ("drivers/net: chelsio/cxgb*: Convert timers to use timer_setup()") Fixes: 3f8cfd0d95e6 ("cxgb4/cxgb4vf: Program hash region for {t4/t4vf}_change_mac()") Signed-off-by: Rahul Lakkireddy Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c | 3 +-- drivers/net/ethernet/chelsio/cxgb4vf/sge.c | 7 ++++--- drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c | 9 +++------ 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c index cec865a97464..a7641be9094f 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c @@ -260,8 +260,7 @@ static int cxgb4vf_set_addr_hash(struct port_info *pi) * @tcam_idx: TCAM index of existing filter for old value of MAC address, * or -1 * @addr: the new MAC address value - * @persist: whether a new MAC allocation should be persistent - * @add_smt: if true also add the address to the HW SMT + * @persistent: whether a new MAC allocation should be persistent * * Modifies an MPS filter and sets it to the new MAC address if * @tcam_idx >= 0, or adds the MAC address to a new filter if diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c index f71c973398ec..8c3d6e11a4bf 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c @@ -1692,7 +1692,7 @@ static inline bool is_new_response(const struct rsp_ctrl *rc, * restore_rx_bufs - put back a packet's RX buffers * @gl: the packet gather list * @fl: the SGE Free List - * @nfrags: how many fragments in @si + * @frags: how many fragments in @si * * Called when we find out that the current packet, @si, can't be * processed right away for some reason. This is a very rare event and @@ -2054,7 +2054,7 @@ irq_handler_t t4vf_intr_handler(struct adapter *adapter) /** * sge_rx_timer_cb - perform periodic maintenance of SGE RX queues - * @data: the adapter + * @t: Rx timer * * Runs periodically from a timer to perform maintenance of SGE RX queues. * @@ -2113,7 +2113,7 @@ static void sge_rx_timer_cb(struct timer_list *t) /** * sge_tx_timer_cb - perform periodic maintenance of SGE Tx queues - * @data: the adapter + * @t: Tx timer * * Runs periodically from a timer to perform maintenance of SGE TX queues. * @@ -2405,6 +2405,7 @@ err: * t4vf_sge_alloc_eth_txq - allocate an SGE Ethernet TX Queue * @adapter: the adapter * @txq: pointer to the new txq to be filled in + * @dev: the network device * @devq: the network TX queue associated with the new txq * @iqid: the relative ingress queue ID to which events relating to * the new txq should be directed diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c index 9d49ff211cc1..a31b87390b50 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c @@ -389,9 +389,7 @@ static inline enum cc_fec fwcap_to_cc_fec(fw_port_cap32_t fw_fec) return cc_fec; } -/** - * Return the highest speed set in the port capabilities, in Mb/s. - */ +/* Return the highest speed set in the port capabilities, in Mb/s. */ static unsigned int fwcap_to_speed(fw_port_cap32_t caps) { #define TEST_SPEED_RETURN(__caps_speed, __speed) \ @@ -1467,6 +1465,7 @@ int t4vf_identify_port(struct adapter *adapter, unsigned int viid, * @bcast: 1 to enable broadcast Rx, 0 to disable it, -1 no change * @vlanex: 1 to enable hardware VLAN Tag extraction, 0 to disable it, * -1 no change + * @sleep_ok: call is allowed to sleep * * Sets Rx properties of a virtual interface. */ @@ -1906,7 +1905,7 @@ static const char *t4vf_link_down_rc_str(unsigned char link_down_rc) /** * t4vf_handle_get_port_info - process a FW reply message * @pi: the port info - * @rpl: start of the FW message + * @cmd: start of the FW message * * Processes a GET_PORT_INFO FW reply message. */ @@ -2137,8 +2136,6 @@ int t4vf_handle_fw_rpl(struct adapter *adapter, const __be64 *rpl) return 0; } -/** - */ int t4vf_prep_adapter(struct adapter *adapter) { int err; -- cgit v1.2.3 From d0ad2ea2bc185835f8a749302ad07b70528d2a09 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 23 Jun 2020 19:01:35 -0400 Subject: bnxt_en: Store the running firmware version code. We currently only store the firmware version as a string for ethtool and devlink info. Store it also as a version code. The next 2 patches will need to check the firmware major version to determine some workarounds. We also use the 16-bit firmware version fields if the firmware is newer and provides the 16-bit fields. Reviewed-by: Edwin Peer Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 22 ++++++++++++++++++---- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 4 ++++ 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index b93e05f91d77..0ad8d490975e 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7240,8 +7240,9 @@ static int __bnxt_hwrm_ver_get(struct bnxt *bp, bool silent) static int bnxt_hwrm_ver_get(struct bnxt *bp) { struct hwrm_ver_get_output *resp = bp->hwrm_cmd_resp_addr; + u16 fw_maj, fw_min, fw_bld, fw_rsv; u32 dev_caps_cfg, hwrm_ver; - int rc; + int rc, len; bp->hwrm_max_req_len = HWRM_MAX_REQ_LEN; mutex_lock(&bp->hwrm_cmd_lock); @@ -7273,9 +7274,22 @@ static int bnxt_hwrm_ver_get(struct bnxt *bp) resp->hwrm_intf_maj_8b, resp->hwrm_intf_min_8b, resp->hwrm_intf_upd_8b); - snprintf(bp->fw_ver_str, BC_HWRM_STR_LEN, "%d.%d.%d.%d", - resp->hwrm_fw_maj_8b, resp->hwrm_fw_min_8b, - resp->hwrm_fw_bld_8b, resp->hwrm_fw_rsvd_8b); + fw_maj = le16_to_cpu(resp->hwrm_fw_major); + if (bp->hwrm_spec_code > 0x10803 && fw_maj) { + fw_min = le16_to_cpu(resp->hwrm_fw_minor); + fw_bld = le16_to_cpu(resp->hwrm_fw_build); + fw_rsv = le16_to_cpu(resp->hwrm_fw_patch); + len = FW_VER_STR_LEN; + } else { + fw_maj = resp->hwrm_fw_maj_8b; + fw_min = resp->hwrm_fw_min_8b; + fw_bld = resp->hwrm_fw_bld_8b; + fw_rsv = resp->hwrm_fw_rsvd_8b; + len = BC_HWRM_STR_LEN; + } + bp->fw_ver_code = BNXT_FW_VER_CODE(fw_maj, fw_min, fw_bld, fw_rsv); + snprintf(bp->fw_ver_str, len, "%d.%d.%d.%d", fw_maj, fw_min, fw_bld, + fw_rsv); if (strlen(resp->active_pkg_name)) { int fw_ver_len = strlen(bp->fw_ver_str); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 9e173d74b72a..858440e1dcdc 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1746,6 +1746,10 @@ struct bnxt { #define PHY_VER_STR_LEN (FW_VER_STR_LEN - BC_HWRM_STR_LEN) char fw_ver_str[FW_VER_STR_LEN]; char hwrm_ver_supp[FW_VER_STR_LEN]; + u64 fw_ver_code; +#define BNXT_FW_VER_CODE(maj, min, bld, rsv) \ + ((u64)(maj) << 48 | (u64)(min) << 32 | (u64)(bld) << 16 | (rsv)) + __be16 vxlan_port; u8 vxlan_port_cnt; __le16 vxlan_fw_dst_port_id; -- cgit v1.2.3 From fed7edd18143c68c63ea049999a7e861123de6de Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 23 Jun 2020 19:01:36 -0400 Subject: bnxt_en: Do not enable legacy TX push on older firmware. Older firmware may not support legacy TX push properly and may not be disabling it. So we check certain firmware versions that may have this problem and disable legacy TX push unconditionally. Fixes: c0c050c58d84 ("bnxt_en: New Broadcom ethernet driver.") Reviewed-by: Edwin Peer Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 ++- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 0ad8d490975e..f8c50b1d2331 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6976,7 +6976,8 @@ static int __bnxt_hwrm_func_qcaps(struct bnxt *bp) bp->fw_cap |= BNXT_FW_CAP_ERR_RECOVER_RELOAD; bp->tx_push_thresh = 0; - if (flags & FUNC_QCAPS_RESP_FLAGS_PUSH_MODE_SUPPORTED) + if ((flags & FUNC_QCAPS_RESP_FLAGS_PUSH_MODE_SUPPORTED) && + BNXT_FW_MAJ(bp) > 217) bp->tx_push_thresh = BNXT_TX_PUSH_THRESH; hw_resc->max_rsscos_ctxs = le16_to_cpu(resp->max_rsscos_ctx); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 858440e1dcdc..78e2fd63ac3d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1749,6 +1749,7 @@ struct bnxt { u64 fw_ver_code; #define BNXT_FW_VER_CODE(maj, min, bld, rsv) \ ((u64)(maj) << 48 | (u64)(min) << 32 | (u64)(bld) << 16 | (rsv)) +#define BNXT_FW_MAJ(bp) ((bp)->fw_ver_code >> 48) __be16 vxlan_port; u8 vxlan_port_cnt; -- cgit v1.2.3 From c2dec363feb41544a76c8083aca2378990e17166 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 23 Jun 2020 19:01:37 -0400 Subject: bnxt_en: Fix statistics counters issue during ifdown with older firmware. On older firmware, the hardware statistics are not cleared when the driver frees the hardware stats contexts during ifdown. The driver expects these stats to be cleared and saves a copy before freeing the stats contexts. During the next ifup, the driver will likely allocate the same hardware stats contexts and this will cause a big increase in the counters as the old counters are added back to the saved counters. We fix it by making an additional firmware call to clear the counters before freeing the hw stats contexts when the firmware is the older 20.x firmware. Fixes: b8875ca356f1 ("bnxt_en: Save ring statistics before reset.") Reported-by: Jakub Kicinski Reviewed-by: Vasundhara Volam Signed-off-by: Michael Chan Tested-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index f8c50b1d2331..6dc7cc4df9e8 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6292,6 +6292,7 @@ int bnxt_hwrm_set_coal(struct bnxt *bp) static void bnxt_hwrm_stat_ctx_free(struct bnxt *bp) { + struct hwrm_stat_ctx_clr_stats_input req0 = {0}; struct hwrm_stat_ctx_free_input req = {0}; int i; @@ -6301,6 +6302,7 @@ static void bnxt_hwrm_stat_ctx_free(struct bnxt *bp) if (BNXT_CHIP_TYPE_NITRO_A0(bp)) return; + bnxt_hwrm_cmd_hdr_init(bp, &req0, HWRM_STAT_CTX_CLR_STATS, -1, -1); bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_STAT_CTX_FREE, -1, -1); mutex_lock(&bp->hwrm_cmd_lock); @@ -6310,7 +6312,11 @@ static void bnxt_hwrm_stat_ctx_free(struct bnxt *bp) if (cpr->hw_stats_ctx_id != INVALID_STATS_CTX_ID) { req.stat_ctx_id = cpu_to_le32(cpr->hw_stats_ctx_id); - + if (BNXT_FW_MAJ(bp) <= 20) { + req0.stat_ctx_id = req.stat_ctx_id; + _hwrm_send_message(bp, &req0, sizeof(req0), + HWRM_CMD_TIMEOUT); + } _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); -- cgit v1.2.3 From c55e28a8b43fcd7dc71868bd165705bc7741a7ca Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Tue, 23 Jun 2020 19:01:38 -0400 Subject: bnxt_en: Read VPD info only for PFs Virtual functions does not have VPD information. This patch modifies calling bnxt_read_vpd_info() only for PFs and avoids an unnecessary error log. Fixes: a0d0fd70fed5 ("bnxt_en: Read partno and serialno of the board from VPD") Signed-off-by: Vasundhara Volam Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 6dc7cc4df9e8..6a884df44612 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -11913,7 +11913,8 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) dev->ethtool_ops = &bnxt_ethtool_ops; pci_set_drvdata(pdev, dev); - bnxt_vpd_read_info(bp); + if (BNXT_PF(bp)) + bnxt_vpd_read_info(bp); rc = bnxt_alloc_hwrm_resources(bp); if (rc) -- cgit v1.2.3 From 4b973f49830d74de36b5f4a75c07f6658524d2e4 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 24 Jun 2020 03:25:45 +0200 Subject: net: ethtool: Handle missing cable test TDR parameters A last minute change put the TDR cable test parameters into a nest. The validation is not sufficient, resulting in an oops if the nest is missing. Set default values first, then update them if the nest is provided. Fixes: f2bc8ad31a7f ("net: ethtool: Allow PHY cable test TDR data to configured") Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- net/ethtool/cabletest.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index 7b7a0456c15c..7194956aa09e 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -234,6 +234,14 @@ static int ethnl_act_cable_test_tdr_cfg(const struct nlattr *nest, struct nlattr *tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX + 1]; int ret; + cfg->first = 100; + cfg->step = 100; + cfg->last = MAX_CABLE_LENGTH_CM; + cfg->pair = PHY_PAIR_ALL; + + if (!nest) + return 0; + ret = nla_parse_nested(tb, ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX, nest, cable_test_tdr_act_cfg_policy, info->extack); if (ret < 0) @@ -242,17 +250,12 @@ static int ethnl_act_cable_test_tdr_cfg(const struct nlattr *nest, if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST]) cfg->first = nla_get_u32( tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST]); - else - cfg->first = 100; + if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST]) cfg->last = nla_get_u32(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST]); - else - cfg->last = MAX_CABLE_LENGTH_CM; if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP]) cfg->step = nla_get_u32(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP]); - else - cfg->step = 100; if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR]) { cfg->pair = nla_get_u8(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR]); @@ -263,8 +266,6 @@ static int ethnl_act_cable_test_tdr_cfg(const struct nlattr *nest, "invalid pair parameter"); return -EINVAL; } - } else { - cfg->pair = PHY_PAIR_ALL; } if (cfg->first > MAX_CABLE_LENGTH_CM) { -- cgit v1.2.3 From 41b14fb8724d5a4b382a63cb4a1a61880347ccb8 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 22 Jun 2020 23:26:04 +0300 Subject: net: Do not clear the sock TX queue in sk_set_socket() Clearing the sock TX queue in sk_set_socket() might cause unexpected out-of-order transmit when called from sock_orphan(), as outstanding packets can pick a different TX queue and bypass the ones already queued. This is undesired in general. More specifically, it breaks the in-order scheduling property guarantee for device-offloaded TLS sockets. Remove the call to sk_tx_queue_clear() in sk_set_socket(), and add it explicitly only where needed. Fixes: e022f0b4a03f ("net: Introduce sk_tx_queue_mapping") Signed-off-by: Tariq Toukan Reviewed-by: Boris Pismenny Signed-off-by: David S. Miller --- include/net/sock.h | 1 - net/core/sock.c | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/net/sock.h b/include/net/sock.h index c53cc42b5ab9..3428619faae4 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1848,7 +1848,6 @@ static inline int sk_rx_queue_get(const struct sock *sk) static inline void sk_set_socket(struct sock *sk, struct socket *sock) { - sk_tx_queue_clear(sk); sk->sk_socket = sock; } diff --git a/net/core/sock.c b/net/core/sock.c index 94391da27754..d832c650287c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1767,6 +1767,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, cgroup_sk_alloc(&sk->sk_cgrp_data); sock_update_classid(&sk->sk_cgrp_data); sock_update_netprioidx(&sk->sk_cgrp_data); + sk_tx_queue_clear(sk); } return sk; @@ -1990,6 +1991,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) */ sk_refcnt_debug_inc(newsk); sk_set_socket(newsk, NULL); + sk_tx_queue_clear(newsk); RCU_INIT_POINTER(newsk->sk_wq, NULL); if (newsk->sk_prot->sockets_allocated) -- cgit v1.2.3 From c718af2d00a37587b09e5958d142da7569f3d55b Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 23 Jun 2020 17:47:23 +0100 Subject: net: phylink: fix ethtool -A with attached PHYs Fix a phylink's ethtool set_pauseparam support deadlock caused by phylib interacting with phylink: we must not hold the state lock while calling phylib functions that may call into phylink_phy_change(). Fixes: f904f15ea9b5 ("net: phylink: allow ethtool -A to change flow control advertisement") Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/phylink.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 0ab65fb75258..453f9a399bb1 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -1502,18 +1502,20 @@ int phylink_ethtool_set_pauseparam(struct phylink *pl, linkmode_set_pause(config->advertising, pause->tx_pause, pause->rx_pause); - /* If we have a PHY, phylib will call our link state function if the - * mode has changed, which will trigger a resolve and update the MAC - * configuration. + if (!pl->phydev && !test_bit(PHYLINK_DISABLE_STOPPED, + &pl->phylink_disable_state)) + phylink_pcs_config(pl, true, &pl->link_config); + + mutex_unlock(&pl->state_mutex); + + /* If we have a PHY, a change of the pause frame advertisement will + * cause phylib to renegotiate (if AN is enabled) which will in turn + * call our phylink_phy_change() and trigger a resolve. Note that + * we can't hold our state mutex while calling phy_set_asym_pause(). */ - if (pl->phydev) { + if (pl->phydev) phy_set_asym_pause(pl->phydev, pause->rx_pause, pause->tx_pause); - } else if (!test_bit(PHYLINK_DISABLE_STOPPED, - &pl->phylink_disable_state)) { - phylink_pcs_config(pl, true, &pl->link_config); - } - mutex_unlock(&pl->state_mutex); return 0; } -- cgit v1.2.3 From 2e919bc446faee429ac862a6cdb5e40017051f6b Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 23 Jun 2020 17:47:29 +0100 Subject: net: phylink: ensure manual pause mode configuration takes effect We have been relying on link events and mac_config() when the manual pause modes are changed. With recent developments, such as moving the programming of link state to mac_link_up(), this no longer works. To ensure that we update the MAC, we must generate a link-down followed by a link-up event; we can do that by setting mac_link_dropped and triggering a resolve. Fixes: 91a208f2185a ("net: phylink: propagate resolved link config via mac_link_up()") Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/phylink.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 453f9a399bb1..3b7c70e6c5dd 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -1463,6 +1463,8 @@ int phylink_ethtool_set_pauseparam(struct phylink *pl, struct ethtool_pauseparam *pause) { struct phylink_link_state *config = &pl->link_config; + bool manual_changed; + int pause_state; ASSERT_RTNL(); @@ -1477,15 +1479,15 @@ int phylink_ethtool_set_pauseparam(struct phylink *pl, !pause->autoneg && pause->rx_pause != pause->tx_pause) return -EINVAL; - mutex_lock(&pl->state_mutex); - config->pause = 0; + pause_state = 0; if (pause->autoneg) - config->pause |= MLO_PAUSE_AN; + pause_state |= MLO_PAUSE_AN; if (pause->rx_pause) - config->pause |= MLO_PAUSE_RX; + pause_state |= MLO_PAUSE_RX; if (pause->tx_pause) - config->pause |= MLO_PAUSE_TX; + pause_state |= MLO_PAUSE_TX; + mutex_lock(&pl->state_mutex); /* * See the comments for linkmode_set_pause(), wrt the deficiencies * with the current implementation. A solution to this issue would @@ -1502,6 +1504,12 @@ int phylink_ethtool_set_pauseparam(struct phylink *pl, linkmode_set_pause(config->advertising, pause->tx_pause, pause->rx_pause); + manual_changed = (config->pause ^ pause_state) & MLO_PAUSE_AN || + (!(pause_state & MLO_PAUSE_AN) && + (config->pause ^ pause_state) & MLO_PAUSE_TXRX_MASK); + + config->pause = pause_state; + if (!pl->phydev && !test_bit(PHYLINK_DISABLE_STOPPED, &pl->phylink_disable_state)) phylink_pcs_config(pl, true, &pl->link_config); @@ -1517,6 +1525,15 @@ int phylink_ethtool_set_pauseparam(struct phylink *pl, phy_set_asym_pause(pl->phydev, pause->rx_pause, pause->tx_pause); + /* If the manual pause settings changed, make sure we trigger a + * resolve to update their state; we can not guarantee that the + * link will cycle. + */ + if (manual_changed) { + pl->mac_link_dropped = true; + phylink_run_resolve(pl); + } + return 0; } EXPORT_SYMBOL_GPL(phylink_ethtool_set_pauseparam); -- cgit v1.2.3 From 17843655708e1941c0653af3cd61be6948e36f43 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 23 Jun 2020 18:33:15 +0200 Subject: openvswitch: take into account de-fragmentation/gso_size in execute_check_pkt_len ovs connection tracking module performs de-fragmentation on incoming fragmented traffic. Take info account if traffic has been de-fragmented in execute_check_pkt_len action otherwise we will perform the wrong nested action considering the original packet size. This issue typically occurs if ovs-vswitchd adds a rule in the pipeline that requires connection tracking (e.g. OVN stateful ACLs) before execute_check_pkt_len action. Moreover take into account GSO fragment size for GSO packet in execute_check_pkt_len routine Fixes: 4d5ec89fc8d14 ("net: openvswitch: Add a new action check_pkt_len") Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- net/openvswitch/actions.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index fc0efd8833c8..2611657f40ca 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1169,9 +1169,10 @@ static int execute_check_pkt_len(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr, bool last) { + struct ovs_skb_cb *ovs_cb = OVS_CB(skb); const struct nlattr *actions, *cpl_arg; + int len, max_len, rem = nla_len(attr); const struct check_pkt_len_arg *arg; - int rem = nla_len(attr); bool clone_flow_key; /* The first netlink attribute in 'attr' is always @@ -1180,7 +1181,11 @@ static int execute_check_pkt_len(struct datapath *dp, struct sk_buff *skb, cpl_arg = nla_data(attr); arg = nla_data(cpl_arg); - if (skb->len <= arg->pkt_len) { + len = ovs_cb->mru ? ovs_cb->mru + skb->mac_len : skb->len; + max_len = arg->pkt_len; + + if ((skb_is_gso(skb) && skb_gso_validate_mac_len(skb, max_len)) || + len <= max_len) { /* Second netlink attribute in 'attr' is always * 'OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL'. */ -- cgit v1.2.3 From 1ed9ec9b08addbd8d3e36d5f4a652d8590a6ddb7 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Sat, 20 Jun 2020 21:39:25 +0200 Subject: dsa: Allow forwarding of redirected IGMP traffic The driver for Marvell switches puts all ports in IGMP snooping mode which results in all IGMP/MLD frames that ingress on the ports to be forwarded to the CPU only. The bridge code in the kernel can then interpret these frames and act upon them, for instance by updating the mdb in the switch to reflect multicast memberships of stations connected to the ports. However, the IGMP/MLD frames must then also be forwarded to other ports of the bridge so external IGMP queriers can track membership reports, and external multicast clients can receive query reports from foreign IGMP queriers. Currently, this is impossible as the EDSA tagger sets offload_fwd_mark on the skb when it unwraps the tagged frames, and that will make the switchdev layer prevent the skb from egressing on any other port of the same switch. To fix that, look at the To_CPU code in the DSA header and make forwarding of the frame possible for trapped IGMP packets. Introduce some #defines for the frame types to make the code a bit more comprehensive. This was tested on a Marvell 88E6352 variant. Signed-off-by: Daniel Mack Reviewed-by: Andrew Lunn Tested-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/tag_edsa.c | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index e8eaa804ccb9..d6200ff98200 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -13,6 +13,16 @@ #define DSA_HLEN 4 #define EDSA_HLEN 8 +#define FRAME_TYPE_TO_CPU 0x00 +#define FRAME_TYPE_FORWARD 0x03 + +#define TO_CPU_CODE_MGMT_TRAP 0x00 +#define TO_CPU_CODE_FRAME2REG 0x01 +#define TO_CPU_CODE_IGMP_MLD_TRAP 0x02 +#define TO_CPU_CODE_POLICY_TRAP 0x03 +#define TO_CPU_CODE_ARP_MIRROR 0x04 +#define TO_CPU_CODE_POLICY_MIRROR 0x05 + static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); @@ -77,6 +87,8 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { u8 *edsa_header; + int frame_type; + int code; int source_device; int source_port; @@ -91,8 +103,29 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, /* * Check that frame type is either TO_CPU or FORWARD. */ - if ((edsa_header[0] & 0xc0) != 0x00 && (edsa_header[0] & 0xc0) != 0xc0) + frame_type = edsa_header[0] >> 6; + + switch (frame_type) { + case FRAME_TYPE_TO_CPU: + code = (edsa_header[1] & 0x6) | ((edsa_header[2] >> 4) & 1); + + /* + * Mark the frame to never egress on any port of the same switch + * unless it's a trapped IGMP/MLD packet, in which case the + * bridge might want to forward it. + */ + if (code != TO_CPU_CODE_IGMP_MLD_TRAP) + skb->offload_fwd_mark = 1; + + break; + + case FRAME_TYPE_FORWARD: + skb->offload_fwd_mark = 1; + break; + + default: return NULL; + } /* * Determine source device and port. @@ -156,8 +189,6 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, 2 * ETH_ALEN); } - skb->offload_fwd_mark = 1; - return skb; } -- cgit v1.2.3 From d3d239dcb8aae6d7b10642d292b404e57604f7ea Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Wed, 24 Jun 2020 09:00:44 +0200 Subject: net: ethernet: mvneta: Do not error out in non serdes modes In mvneta_config_interface() the RGMII modes are catched by the default case which is an error return. The RGMII modes are valid modes for the driver, so instead of returning an error add a break statement to return successfully. This avoids this warning for non comphy SoCs which use RGMII, like SolidRun Clearfog: WARNING: CPU: 0 PID: 268 at drivers/net/ethernet/marvell/mvneta.c:3512 mvneta_start_dev+0x220/0x23c Fixes: b4748553f53f ("net: ethernet: mvneta: Fix Serdes configuration for SoCs without comphy") Signed-off-by: Sascha Hauer Reviewed-by: Russell King Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvneta.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index af6000172848..c4552f868157 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -3571,7 +3571,7 @@ static int mvneta_config_interface(struct mvneta_port *pp, MVNETA_HSGMII_SERDES_PROTO); break; default: - return -EINVAL; + break; } } -- cgit v1.2.3 From 41c2b6b4f0f807803bb49f65835d136941a70f85 Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Wed, 24 Jun 2020 09:00:45 +0200 Subject: net: ethernet: mvneta: Add back interface mode validation When writing the serdes configuration register was moved to mvneta_config_interface() the whole code block was removed from mvneta_port_power_up() in the assumption that its only purpose was to write the serdes configuration register. As mentioned by Russell King its purpose was also to check for valid interface modes early so that later in the driver we do not have to care for unexpected interface modes. Add back the test to let the driver bail out early on unhandled interface modes. Fixes: b4748553f53f ("net: ethernet: mvneta: Fix Serdes configuration for SoCs without comphy") Signed-off-by: Sascha Hauer Reviewed-by: Russell King Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvneta.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index c4552f868157..c639e3a29302 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -5009,10 +5009,18 @@ static void mvneta_conf_mbus_windows(struct mvneta_port *pp, } /* Power up the port */ -static void mvneta_port_power_up(struct mvneta_port *pp, int phy_mode) +static int mvneta_port_power_up(struct mvneta_port *pp, int phy_mode) { /* MAC Cause register should be cleared */ mvreg_write(pp, MVNETA_UNIT_INTR_CAUSE, 0); + + if (phy_mode != PHY_INTERFACE_MODE_QSGMII && + phy_mode != PHY_INTERFACE_MODE_SGMII && + !phy_interface_mode_is_8023z(phy_mode) && + !phy_interface_mode_is_rgmii(phy_mode)) + return -EINVAL; + + return 0; } /* Device initialization routine */ @@ -5198,7 +5206,11 @@ static int mvneta_probe(struct platform_device *pdev) if (err < 0) goto err_netdev; - mvneta_port_power_up(pp, phy_mode); + err = mvneta_port_power_up(pp, pp->phy_interface); + if (err < 0) { + dev_err(&pdev->dev, "can't power up port\n"); + return err; + } /* Armada3700 network controller does not support per-cpu * operation, so only single NAPI should be initialized. @@ -5352,7 +5364,11 @@ static int mvneta_resume(struct device *device) } } mvneta_defaults_set(pp); - mvneta_port_power_up(pp, pp->phy_interface); + err = mvneta_port_power_up(pp, pp->phy_interface); + if (err < 0) { + dev_err(device, "can't power up port\n"); + return err; + } netif_device_attach(dev); -- cgit v1.2.3 From 3dd4ef1bdbac959bb20faec93937720ddd9917c6 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Wed, 24 Jun 2020 15:58:24 +0800 Subject: net: phy: make phy_disable_interrupts() non-static We face an issue with rtl8211f, a pin is shared between INTB and PMEB, and the PHY Register Accessible Interrupt is enabled by default, so the INTB/PMEB pin is always active in polling mode case. As Heiner pointed out "I was thinking about calling phy_disable_interrupts() in phy_init_hw(), to have a defined init state as we don't know in which state the PHY is if the PHY driver is loaded. We shouldn't assume that it's the chip power-on defaults, BIOS or boot loader could have changed this. Or in case of dual-boot systems the other OS could leave the PHY in whatever state." Make phy_disable_interrupts() non-static so that it could be used in phy_init_hw() to have a defined init state. Suggested-by: Heiner Kallweit Signed-off-by: Jisheng Zhang Signed-off-by: David S. Miller --- drivers/net/phy/phy.c | 2 +- include/linux/phy.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 1de3938628f4..56cfae950472 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -840,7 +840,7 @@ static void phy_error(struct phy_device *phydev) * phy_disable_interrupts - Disable the PHY interrupts from the PHY side * @phydev: target phy_device struct */ -static int phy_disable_interrupts(struct phy_device *phydev) +int phy_disable_interrupts(struct phy_device *phydev) { int err; diff --git a/include/linux/phy.h b/include/linux/phy.h index 8c05d0fb5c00..b693b609b2f5 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1416,6 +1416,7 @@ int phy_ethtool_ksettings_set(struct phy_device *phydev, int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd); int phy_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); int phy_do_ioctl_running(struct net_device *dev, struct ifreq *ifr, int cmd); +int phy_disable_interrupts(struct phy_device *phydev); void phy_request_interrupt(struct phy_device *phydev); void phy_free_interrupt(struct phy_device *phydev); void phy_print_status(struct phy_device *phydev); -- cgit v1.2.3 From 9886a4dbd2aacf4b044e294e4cdebb43992d44b1 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Wed, 24 Jun 2020 15:59:23 +0800 Subject: net: phy: call phy_disable_interrupts() in phy_init_hw() Call phy_disable_interrupts() in phy_init_hw() to "have a defined init state as we don't know in which state the PHY is if the PHY driver is loaded. We shouldn't assume that it's the chip power-on defaults, BIOS or boot loader could have changed this. Or in case of dual-boot systems the other OS could leave the PHY in whatever state." as pointed out by Heiner. Suggested-by: Heiner Kallweit Signed-off-by: Jisheng Zhang Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 85ba95b598b5..b4978c5fb2ca 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1092,6 +1092,10 @@ int phy_init_hw(struct phy_device *phydev) if (ret < 0) return ret; + ret = phy_disable_interrupts(phydev); + if (ret) + return ret; + if (phydev->drv->config_init) ret = phydev->drv->config_init(phydev); -- cgit v1.2.3 From a51243860893615b457b149da1ef5df0a4f1ffc2 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 24 Jun 2020 11:13:02 +0100 Subject: qed: add missing error test for DBG_STATUS_NO_MATCHING_FRAMING_MODE The error DBG_STATUS_NO_MATCHING_FRAMING_MODE was added to the enum enum dbg_status however there is a missing corresponding entry for this in the array s_status_str. This causes an out-of-bounds read when indexing into the last entry of s_status_str. Fix this by adding in the missing entry. Addresses-Coverity: ("Out-of-bounds read"). Fixes: 2d22bc8354b1 ("qed: FW 8.42.2.0 debug features") Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_debug.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.c b/drivers/net/ethernet/qlogic/qed/qed_debug.c index 57a0dab88431..81e8fbe4a05b 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_debug.c +++ b/drivers/net/ethernet/qlogic/qed/qed_debug.c @@ -5568,7 +5568,8 @@ static const char * const s_status_str[] = { /* DBG_STATUS_INVALID_FILTER_TRIGGER_DWORDS */ "The filter/trigger constraint dword offsets are not enabled for recording", - + /* DBG_STATUS_NO_MATCHING_FRAMING_MODE */ + "No matching framing mode", /* DBG_STATUS_VFC_READ_ERROR */ "Error reading from VFC", -- cgit v1.2.3 From 715028460082d07a7ec6fcd87b14b46784346a72 Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 10 Jun 2020 21:51:11 +0100 Subject: netfilter: ipset: fix unaligned atomic access When using ip_set with counters and comment, traffic causes the kernel to panic on 32-bit ARM: Alignment trap: not handling instruction e1b82f9f at [] Unhandled fault: alignment exception (0x221) at 0xea08133c PC is at ip_set_match_extensions+0xe0/0x224 [ip_set] The problem occurs when we try to update the 64-bit counters - the faulting address above is not 64-bit aligned. The problem occurs due to the way elements are allocated, for example: set->dsize = ip_set_elem_len(set, tb, 0, 0); map = ip_set_alloc(sizeof(*map) + elements * set->dsize); If the element has a requirement for a member to be 64-bit aligned, and set->dsize is not a multiple of 8, but is a multiple of four, then every odd numbered elements will be misaligned - and hitting an atomic64_add() on that element will cause the kernel to panic. ip_set_elem_len() must return a size that is rounded to the maximum alignment of any extension field stored in the element. This change ensures that is the case. Fixes: 95ad1f4a9358 ("netfilter: ipset: Fix extension alignment") Signed-off-by: Russell King Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 340cb955af25..56621d6bfd29 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -460,6 +460,8 @@ ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len, for (id = 0; id < IPSET_EXT_ID_MAX; id++) { if (!add_extension(id, cadt_flags, tb)) continue; + if (align < ip_set_extensions[id].align) + align = ip_set_extensions[id].align; len = ALIGN(len, ip_set_extensions[id].align); set->offset[id] = len; set->extensions |= ip_set_extensions[id].type; -- cgit v1.2.3 From 4cacc39516784670aa09833a9ec8bf3e90bef561 Mon Sep 17 00:00:00 2001 From: Rob Gill Date: Sun, 21 Jun 2020 05:27:36 +0000 Subject: netfilter: Add MODULE_DESCRIPTION entries to kernel modules The user tool modinfo is used to get information on kernel modules, including a description where it is available. This patch adds a brief MODULE_DESCRIPTION to netfilter kernel modules (descriptions taken from Kconfig file or code comments) Signed-off-by: Rob Gill Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/nft_meta_bridge.c | 1 + net/bridge/netfilter/nft_reject_bridge.c | 1 + net/ipv4/netfilter/ipt_SYNPROXY.c | 1 + net/ipv4/netfilter/nf_flow_table_ipv4.c | 1 + net/ipv4/netfilter/nft_dup_ipv4.c | 1 + net/ipv4/netfilter/nft_fib_ipv4.c | 1 + net/ipv4/netfilter/nft_reject_ipv4.c | 1 + net/ipv6/netfilter/ip6t_SYNPROXY.c | 1 + net/ipv6/netfilter/nf_flow_table_ipv6.c | 1 + net/ipv6/netfilter/nft_dup_ipv6.c | 1 + net/ipv6/netfilter/nft_fib_ipv6.c | 1 + net/ipv6/netfilter/nft_reject_ipv6.c | 1 + net/netfilter/nf_dup_netdev.c | 1 + net/netfilter/nf_flow_table_core.c | 1 + net/netfilter/nf_flow_table_inet.c | 1 + net/netfilter/nf_synproxy_core.c | 1 + net/netfilter/nfnetlink.c | 1 + net/netfilter/nft_compat.c | 1 + net/netfilter/nft_connlimit.c | 1 + net/netfilter/nft_counter.c | 1 + net/netfilter/nft_ct.c | 1 + net/netfilter/nft_dup_netdev.c | 1 + net/netfilter/nft_fib_inet.c | 1 + net/netfilter/nft_fib_netdev.c | 1 + net/netfilter/nft_flow_offload.c | 1 + net/netfilter/nft_hash.c | 1 + net/netfilter/nft_limit.c | 1 + net/netfilter/nft_log.c | 1 + net/netfilter/nft_masq.c | 1 + net/netfilter/nft_nat.c | 1 + net/netfilter/nft_numgen.c | 1 + net/netfilter/nft_objref.c | 1 + net/netfilter/nft_osf.c | 1 + net/netfilter/nft_queue.c | 1 + net/netfilter/nft_quota.c | 1 + net/netfilter/nft_redir.c | 1 + net/netfilter/nft_reject.c | 1 + net/netfilter/nft_reject_inet.c | 1 + net/netfilter/nft_synproxy.c | 1 + net/netfilter/nft_tunnel.c | 1 + net/netfilter/xt_nat.c | 1 + 41 files changed, 41 insertions(+) diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index 7c9e92b2f806..8e8ffac037cd 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -155,3 +155,4 @@ module_exit(nft_meta_bridge_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("wenxu "); MODULE_ALIAS_NFT_AF_EXPR(AF_BRIDGE, "meta"); +MODULE_DESCRIPTION("Support for bridge dedicated meta key"); diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index f48cf4cfb80f..deae2c9a0f69 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -455,3 +455,4 @@ module_exit(nft_reject_bridge_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso "); MODULE_ALIAS_NFT_AF_EXPR(AF_BRIDGE, "reject"); +MODULE_DESCRIPTION("Reject packets from bridge via nftables"); diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 748dc3ce58d3..f2984c7eef40 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -118,3 +118,4 @@ module_exit(synproxy_tg4_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy "); +MODULE_DESCRIPTION("Intercept TCP connections and establish them using syncookies"); diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c index e32e41b99f0f..aba65fe90345 100644 --- a/net/ipv4/netfilter/nf_flow_table_ipv4.c +++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c @@ -34,3 +34,4 @@ module_exit(nf_flow_ipv4_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso "); MODULE_ALIAS_NF_FLOWTABLE(AF_INET); +MODULE_DESCRIPTION("Netfilter flow table support"); diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c index abf89b972094..bcdb37f86a94 100644 --- a/net/ipv4/netfilter/nft_dup_ipv4.c +++ b/net/ipv4/netfilter/nft_dup_ipv4.c @@ -107,3 +107,4 @@ module_exit(nft_dup_ipv4_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso "); MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "dup"); +MODULE_DESCRIPTION("IPv4 nftables packet duplication support"); diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index ce294113dbcd..03df986217b7 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -210,3 +210,4 @@ module_exit(nft_fib4_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Florian Westphal "); MODULE_ALIAS_NFT_AF_EXPR(2, "fib"); +MODULE_DESCRIPTION("nftables fib / ip route lookup support"); diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index 7e6fd5cde50f..e408f813f5d8 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -71,3 +71,4 @@ module_exit(nft_reject_ipv4_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy "); MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "reject"); +MODULE_DESCRIPTION("IPv4 packet rejection for nftables"); diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index fd1f52a21bf1..d51d0c3e5fe9 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -121,3 +121,4 @@ module_exit(synproxy_tg6_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy "); +MODULE_DESCRIPTION("Intercept IPv6 TCP connections and establish them using syncookies"); diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c index a8566ee12e83..667b8af2546a 100644 --- a/net/ipv6/netfilter/nf_flow_table_ipv6.c +++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c @@ -35,3 +35,4 @@ module_exit(nf_flow_ipv6_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso "); MODULE_ALIAS_NF_FLOWTABLE(AF_INET6); +MODULE_DESCRIPTION("Netfilter flow table IPv6 module"); diff --git a/net/ipv6/netfilter/nft_dup_ipv6.c b/net/ipv6/netfilter/nft_dup_ipv6.c index 2af32200507d..8b5193efb1f1 100644 --- a/net/ipv6/netfilter/nft_dup_ipv6.c +++ b/net/ipv6/netfilter/nft_dup_ipv6.c @@ -105,3 +105,4 @@ module_exit(nft_dup_ipv6_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso "); MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "dup"); +MODULE_DESCRIPTION("IPv6 nftables packet duplication support"); diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index 7ece86afd079..e204163c7036 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -255,3 +255,4 @@ module_exit(nft_fib6_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Florian Westphal "); MODULE_ALIAS_NFT_AF_EXPR(10, "fib"); +MODULE_DESCRIPTION("nftables fib / ipv6 route lookup support"); diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c index 680a28ce29fd..c1098a1968e1 100644 --- a/net/ipv6/netfilter/nft_reject_ipv6.c +++ b/net/ipv6/netfilter/nft_reject_ipv6.c @@ -72,3 +72,4 @@ module_exit(nft_reject_ipv6_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy "); MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "reject"); +MODULE_DESCRIPTION("IPv6 packet rejection for nftables"); diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c index f108a76925dd..2b01a151eaa8 100644 --- a/net/netfilter/nf_dup_netdev.c +++ b/net/netfilter/nf_dup_netdev.c @@ -73,3 +73,4 @@ EXPORT_SYMBOL_GPL(nft_fwd_dup_netdev_offload); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso "); +MODULE_DESCRIPTION("Netfilter packet duplication support"); diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index afa85171df38..b1eb5272b379 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -594,3 +594,4 @@ module_exit(nf_flow_table_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso "); +MODULE_DESCRIPTION("Netfilter flow table module"); diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index 88bedf1ff1ae..bc4126d8ef65 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -72,3 +72,4 @@ module_exit(nf_flow_inet_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso "); MODULE_ALIAS_NF_FLOWTABLE(1); /* NFPROTO_INET */ +MODULE_DESCRIPTION("Netfilter flow table mixed IPv4/IPv6 module"); diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index b9cbe1e2453e..ebcdc8e54476 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -1237,3 +1237,4 @@ EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_fini); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy "); +MODULE_DESCRIPTION("nftables SYNPROXY expression support"); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 99127e2d95a8..5f24edf95830 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -33,6 +33,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte "); MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER); +MODULE_DESCRIPTION("Netfilter messages via netlink socket"); #define nfnl_dereference_protected(id) \ rcu_dereference_protected(table[(id)].subsys, \ diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index f9adca62ccb3..aa1a066cb74b 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -902,3 +902,4 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso "); MODULE_ALIAS_NFT_EXPR("match"); MODULE_ALIAS_NFT_EXPR("target"); +MODULE_DESCRIPTION("x_tables over nftables support"); diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index 69d6173f91e2..7d0761fad37e 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -280,3 +280,4 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso"); MODULE_ALIAS_NFT_EXPR("connlimit"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CONNLIMIT); +MODULE_DESCRIPTION("nftables connlimit rule support"); diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index f6d4d0fa23a6..85ed461ec24e 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -303,3 +303,4 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy "); MODULE_ALIAS_NFT_EXPR("counter"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_COUNTER); +MODULE_DESCRIPTION("nftables counter rule support"); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index faea72c2df32..77258af1fce0 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -1345,3 +1345,4 @@ MODULE_ALIAS_NFT_EXPR("notrack"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_HELPER); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_TIMEOUT); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_EXPECT); +MODULE_DESCRIPTION("Netfilter nf_tables conntrack module"); diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c index c2e78c160fd7..40788b3f1071 100644 --- a/net/netfilter/nft_dup_netdev.c +++ b/net/netfilter/nft_dup_netdev.c @@ -102,3 +102,4 @@ module_exit(nft_dup_netdev_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso "); MODULE_ALIAS_NFT_AF_EXPR(5, "dup"); +MODULE_DESCRIPTION("nftables netdev packet duplication support"); diff --git a/net/netfilter/nft_fib_inet.c b/net/netfilter/nft_fib_inet.c index 465432e0531b..a88d44e163d1 100644 --- a/net/netfilter/nft_fib_inet.c +++ b/net/netfilter/nft_fib_inet.c @@ -76,3 +76,4 @@ module_exit(nft_fib_inet_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Florian Westphal "); MODULE_ALIAS_NFT_AF_EXPR(1, "fib"); +MODULE_DESCRIPTION("nftables fib inet support"); diff --git a/net/netfilter/nft_fib_netdev.c b/net/netfilter/nft_fib_netdev.c index a2e726ae7f07..3f3478abd845 100644 --- a/net/netfilter/nft_fib_netdev.c +++ b/net/netfilter/nft_fib_netdev.c @@ -85,3 +85,4 @@ module_exit(nft_fib_netdev_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo M. Bermudo Garay "); MODULE_ALIAS_NFT_AF_EXPR(5, "fib"); +MODULE_DESCRIPTION("nftables netdev fib lookups support"); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index b70b48996801..3b9b97aa4b32 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -286,3 +286,4 @@ module_exit(nft_flow_offload_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso "); MODULE_ALIAS_NFT_EXPR("flow_offload"); +MODULE_DESCRIPTION("nftables hardware flow offload module"); diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index b836d550b919..96371d878e7e 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -248,3 +248,4 @@ module_exit(nft_hash_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Laura Garcia "); MODULE_ALIAS_NFT_EXPR("hash"); +MODULE_DESCRIPTION("Netfilter nftables hash module"); diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index 35b67d7e3694..0e2c315c3b5e 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -372,3 +372,4 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy "); MODULE_ALIAS_NFT_EXPR("limit"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_LIMIT); +MODULE_DESCRIPTION("nftables limit expression support"); diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index fe4831f2258f..57899454a530 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -298,3 +298,4 @@ module_exit(nft_log_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy "); MODULE_ALIAS_NFT_EXPR("log"); +MODULE_DESCRIPTION("Netfilter nf_tables log module"); diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index bc9fd98c5d6d..71390b727040 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -305,3 +305,4 @@ module_exit(nft_masq_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Arturo Borrero Gonzalez "); MODULE_ALIAS_NFT_EXPR("masq"); +MODULE_DESCRIPTION("Netfilter nftables masquerade expression support"); diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 23a7bfd10521..4bcf33b049c4 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -402,3 +402,4 @@ module_exit(nft_nat_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Tomasz Bursztyka "); MODULE_ALIAS_NFT_EXPR("nat"); +MODULE_DESCRIPTION("Network Address Translation support"); diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index 48edb9d5f012..f1fc824f9737 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -217,3 +217,4 @@ module_exit(nft_ng_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Laura Garcia "); MODULE_ALIAS_NFT_EXPR("numgen"); +MODULE_DESCRIPTION("nftables number generator module"); diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index bfd18d2b65a2..5f9207a9f485 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -252,3 +252,4 @@ module_exit(nft_objref_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso "); MODULE_ALIAS_NFT_EXPR("objref"); +MODULE_DESCRIPTION("nftables stateful object reference module"); diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index b42247aa48a9..c261d57a666a 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -149,3 +149,4 @@ module_exit(nft_osf_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Fernando Fernandez "); MODULE_ALIAS_NFT_EXPR("osf"); +MODULE_DESCRIPTION("nftables passive OS fingerprint support"); diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c index 5ece0a6aa8c3..23265d757acb 100644 --- a/net/netfilter/nft_queue.c +++ b/net/netfilter/nft_queue.c @@ -216,3 +216,4 @@ module_exit(nft_queue_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Eric Leblond "); MODULE_ALIAS_NFT_EXPR("queue"); +MODULE_DESCRIPTION("Netfilter nftables queue module"); diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index 4413690591f2..0363f533a42b 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -254,3 +254,4 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso "); MODULE_ALIAS_NFT_EXPR("quota"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_QUOTA); +MODULE_DESCRIPTION("Netfilter nftables quota module"); diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index 5b779171565c..2056051c0af0 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -292,3 +292,4 @@ module_exit(nft_redir_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Arturo Borrero Gonzalez "); MODULE_ALIAS_NFT_EXPR("redir"); +MODULE_DESCRIPTION("Netfilter nftables redirect support"); diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index 00f865fb80ca..86eafbb0fdd0 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -119,3 +119,4 @@ EXPORT_SYMBOL_GPL(nft_reject_icmpv6_code); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy "); +MODULE_DESCRIPTION("Netfilter x_tables over nftables module"); diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index f41f414b72d1..cf8f2646e93c 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -149,3 +149,4 @@ module_exit(nft_reject_inet_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy "); MODULE_ALIAS_NFT_AF_EXPR(1, "reject"); +MODULE_DESCRIPTION("Netfilter nftables reject inet support"); diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c index e2c1fc608841..4fda8b3f1762 100644 --- a/net/netfilter/nft_synproxy.c +++ b/net/netfilter/nft_synproxy.c @@ -388,3 +388,4 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Fernando Fernandez "); MODULE_ALIAS_NFT_EXPR("synproxy"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_SYNPROXY); +MODULE_DESCRIPTION("nftables SYNPROXY expression support"); diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 30be5787fbde..d3eb953d0333 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -719,3 +719,4 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso "); MODULE_ALIAS_NFT_EXPR("tunnel"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_TUNNEL); +MODULE_DESCRIPTION("nftables tunnel expression support"); diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c index a8e5f6c8db7a..b4f7bbc3f3ca 100644 --- a/net/netfilter/xt_nat.c +++ b/net/netfilter/xt_nat.c @@ -244,3 +244,4 @@ MODULE_ALIAS("ipt_SNAT"); MODULE_ALIAS("ipt_DNAT"); MODULE_ALIAS("ip6t_SNAT"); MODULE_ALIAS("ip6t_DNAT"); +MODULE_DESCRIPTION("SNAT and DNAT targets support"); -- cgit v1.2.3 From 1cbf90985f7448f1b0dd630e17ee1070f7d58665 Mon Sep 17 00:00:00 2001 From: David Wilder Date: Mon, 22 Jun 2020 10:10:11 -0700 Subject: netfilter: iptables: Split ipt_unregister_table() into pre_exit and exit helpers. The pre_exit will un-register the underlying hook and .exit will do the table freeing. The netns core does an unconditional synchronize_rcu after the pre_exit hooks insuring no packets are in flight that have picked up the pointer before completing the un-register. Fixes: b9e69e127397 ("netfilter: xtables: don't hook tables by default") Signed-off-by: David Wilder Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ipv4/ip_tables.h | 6 ++++++ net/ipv4/netfilter/ip_tables.c | 15 ++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h index b394bd4f68a3..c4676d6feeff 100644 --- a/include/linux/netfilter_ipv4/ip_tables.h +++ b/include/linux/netfilter_ipv4/ip_tables.h @@ -25,6 +25,12 @@ int ipt_register_table(struct net *net, const struct xt_table *table, const struct ipt_replace *repl, const struct nf_hook_ops *ops, struct xt_table **res); + +void ipt_unregister_table_pre_exit(struct net *net, struct xt_table *table, + const struct nf_hook_ops *ops); + +void ipt_unregister_table_exit(struct net *net, struct xt_table *table); + void ipt_unregister_table(struct net *net, struct xt_table *table, const struct nf_hook_ops *ops); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index c2670eaa74e6..5bf9fa06aee0 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1797,11 +1797,22 @@ out_free: return ret; } +void ipt_unregister_table_pre_exit(struct net *net, struct xt_table *table, + const struct nf_hook_ops *ops) +{ + nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); +} + +void ipt_unregister_table_exit(struct net *net, struct xt_table *table) +{ + __ipt_unregister_table(net, table); +} + void ipt_unregister_table(struct net *net, struct xt_table *table, const struct nf_hook_ops *ops) { if (ops) - nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + ipt_unregister_table_pre_exit(net, table, ops); __ipt_unregister_table(net, table); } @@ -1958,6 +1969,8 @@ static void __exit ip_tables_fini(void) EXPORT_SYMBOL(ipt_register_table); EXPORT_SYMBOL(ipt_unregister_table); +EXPORT_SYMBOL(ipt_unregister_table_pre_exit); +EXPORT_SYMBOL(ipt_unregister_table_exit); EXPORT_SYMBOL(ipt_do_table); module_init(ip_tables_init); module_exit(ip_tables_fini); -- cgit v1.2.3 From cf4cbc610bfa29a88cd71ca638a890f8c565a22e Mon Sep 17 00:00:00 2001 From: David Wilder Date: Mon, 22 Jun 2020 10:10:12 -0700 Subject: netfilter: iptables: Add a .pre_exit hook in all iptable_foo.c. Using new helpers ipt_unregister_table_pre_exit() and ipt_unregister_table_exit(). Fixes: b9e69e127397 ("netfilter: xtables: don't hook tables by default") Signed-off-by: David Wilder Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/iptable_filter.c | 10 +++++++++- net/ipv4/netfilter/iptable_mangle.c | 10 +++++++++- net/ipv4/netfilter/iptable_nat.c | 10 ++++++++-- net/ipv4/netfilter/iptable_raw.c | 10 +++++++++- net/ipv4/netfilter/iptable_security.c | 11 +++++++++-- 5 files changed, 44 insertions(+), 7 deletions(-) diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 9d54b4017e50..8f7bc1ee7453 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -72,16 +72,24 @@ static int __net_init iptable_filter_net_init(struct net *net) return 0; } +static void __net_exit iptable_filter_net_pre_exit(struct net *net) +{ + if (net->ipv4.iptable_filter) + ipt_unregister_table_pre_exit(net, net->ipv4.iptable_filter, + filter_ops); +} + static void __net_exit iptable_filter_net_exit(struct net *net) { if (!net->ipv4.iptable_filter) return; - ipt_unregister_table(net, net->ipv4.iptable_filter, filter_ops); + ipt_unregister_table_exit(net, net->ipv4.iptable_filter); net->ipv4.iptable_filter = NULL; } static struct pernet_operations iptable_filter_net_ops = { .init = iptable_filter_net_init, + .pre_exit = iptable_filter_net_pre_exit, .exit = iptable_filter_net_exit, }; diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index bb9266ea3785..f703a717ab1d 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -100,15 +100,23 @@ static int __net_init iptable_mangle_table_init(struct net *net) return ret; } +static void __net_exit iptable_mangle_net_pre_exit(struct net *net) +{ + if (net->ipv4.iptable_mangle) + ipt_unregister_table_pre_exit(net, net->ipv4.iptable_mangle, + mangle_ops); +} + static void __net_exit iptable_mangle_net_exit(struct net *net) { if (!net->ipv4.iptable_mangle) return; - ipt_unregister_table(net, net->ipv4.iptable_mangle, mangle_ops); + ipt_unregister_table_exit(net, net->ipv4.iptable_mangle); net->ipv4.iptable_mangle = NULL; } static struct pernet_operations iptable_mangle_net_ops = { + .pre_exit = iptable_mangle_net_pre_exit, .exit = iptable_mangle_net_exit, }; diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index ad33687b7444..b0143b109f25 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -113,16 +113,22 @@ static int __net_init iptable_nat_table_init(struct net *net) return ret; } +static void __net_exit iptable_nat_net_pre_exit(struct net *net) +{ + if (net->ipv4.nat_table) + ipt_nat_unregister_lookups(net); +} + static void __net_exit iptable_nat_net_exit(struct net *net) { if (!net->ipv4.nat_table) return; - ipt_nat_unregister_lookups(net); - ipt_unregister_table(net, net->ipv4.nat_table, NULL); + ipt_unregister_table_exit(net, net->ipv4.nat_table); net->ipv4.nat_table = NULL; } static struct pernet_operations iptable_nat_net_ops = { + .pre_exit = iptable_nat_net_pre_exit, .exit = iptable_nat_net_exit, }; diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 69697eb4bfc6..9abfe6bf2cb9 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -67,15 +67,23 @@ static int __net_init iptable_raw_table_init(struct net *net) return ret; } +static void __net_exit iptable_raw_net_pre_exit(struct net *net) +{ + if (net->ipv4.iptable_raw) + ipt_unregister_table_pre_exit(net, net->ipv4.iptable_raw, + rawtable_ops); +} + static void __net_exit iptable_raw_net_exit(struct net *net) { if (!net->ipv4.iptable_raw) return; - ipt_unregister_table(net, net->ipv4.iptable_raw, rawtable_ops); + ipt_unregister_table_exit(net, net->ipv4.iptable_raw); net->ipv4.iptable_raw = NULL; } static struct pernet_operations iptable_raw_net_ops = { + .pre_exit = iptable_raw_net_pre_exit, .exit = iptable_raw_net_exit, }; diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index ac633c1db97e..415c1975d770 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -62,16 +62,23 @@ static int __net_init iptable_security_table_init(struct net *net) return ret; } +static void __net_exit iptable_security_net_pre_exit(struct net *net) +{ + if (net->ipv4.iptable_security) + ipt_unregister_table_pre_exit(net, net->ipv4.iptable_security, + sectbl_ops); +} + static void __net_exit iptable_security_net_exit(struct net *net) { if (!net->ipv4.iptable_security) return; - - ipt_unregister_table(net, net->ipv4.iptable_security, sectbl_ops); + ipt_unregister_table_exit(net, net->ipv4.iptable_security); net->ipv4.iptable_security = NULL; } static struct pernet_operations iptable_security_net_ops = { + .pre_exit = iptable_security_net_pre_exit, .exit = iptable_security_net_exit, }; -- cgit v1.2.3 From 57ea5f18882a3d7cf6135fa8c949a37c89395837 Mon Sep 17 00:00:00 2001 From: David Wilder Date: Mon, 22 Jun 2020 10:10:13 -0700 Subject: netfilter: ip6tables: Split ip6t_unregister_table() into pre_exit and exit helpers. The pre_exit will un-register the underlying hook and .exit will do the table freeing. The netns core does an unconditional synchronize_rcu after the pre_exit hooks insuring no packets are in flight that have picked up the pointer before completing the un-register. Fixes: b9e69e127397 ("netfilter: xtables: don't hook tables by default") Signed-off-by: David Wilder Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ipv6/ip6_tables.h | 3 +++ net/ipv6/netfilter/ip6_tables.c | 15 ++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h index 8225f7821a29..1547d5f9ae06 100644 --- a/include/linux/netfilter_ipv6/ip6_tables.h +++ b/include/linux/netfilter_ipv6/ip6_tables.h @@ -29,6 +29,9 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, const struct nf_hook_ops *ops, struct xt_table **res); void ip6t_unregister_table(struct net *net, struct xt_table *table, const struct nf_hook_ops *ops); +void ip6t_unregister_table_pre_exit(struct net *net, struct xt_table *table, + const struct nf_hook_ops *ops); +void ip6t_unregister_table_exit(struct net *net, struct xt_table *table); extern unsigned int ip6t_do_table(struct sk_buff *skb, const struct nf_hook_state *state, struct xt_table *table); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index e27393498ecb..e96a431549bc 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1807,11 +1807,22 @@ out_free: return ret; } +void ip6t_unregister_table_pre_exit(struct net *net, struct xt_table *table, + const struct nf_hook_ops *ops) +{ + nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); +} + +void ip6t_unregister_table_exit(struct net *net, struct xt_table *table) +{ + __ip6t_unregister_table(net, table); +} + void ip6t_unregister_table(struct net *net, struct xt_table *table, const struct nf_hook_ops *ops) { if (ops) - nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + ip6t_unregister_table_pre_exit(net, table, ops); __ip6t_unregister_table(net, table); } @@ -1969,6 +1980,8 @@ static void __exit ip6_tables_fini(void) EXPORT_SYMBOL(ip6t_register_table); EXPORT_SYMBOL(ip6t_unregister_table); +EXPORT_SYMBOL(ip6t_unregister_table_pre_exit); +EXPORT_SYMBOL(ip6t_unregister_table_exit); EXPORT_SYMBOL(ip6t_do_table); module_init(ip6_tables_init); -- cgit v1.2.3 From 5f027bc74a9be2e53233de4ebcd2f90390b50a62 Mon Sep 17 00:00:00 2001 From: David Wilder Date: Mon, 22 Jun 2020 10:10:14 -0700 Subject: netfilter: ip6tables: Add a .pre_exit hook in all ip6table_foo.c. Using new helpers ip6t_unregister_table_pre_exit() and ip6t_unregister_table_exit(). Fixes: b9e69e127397 ("netfilter: xtables: don't hook tables by default") Signed-off-by: David Wilder Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/ip6table_filter.c | 10 +++++++++- net/ipv6/netfilter/ip6table_mangle.c | 10 +++++++++- net/ipv6/netfilter/ip6table_nat.c | 10 ++++++++-- net/ipv6/netfilter/ip6table_raw.c | 10 +++++++++- net/ipv6/netfilter/ip6table_security.c | 10 +++++++++- 5 files changed, 44 insertions(+), 6 deletions(-) diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index 32667f5d5a33..88337b51ffbf 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -73,16 +73,24 @@ static int __net_init ip6table_filter_net_init(struct net *net) return 0; } +static void __net_exit ip6table_filter_net_pre_exit(struct net *net) +{ + if (net->ipv6.ip6table_filter) + ip6t_unregister_table_pre_exit(net, net->ipv6.ip6table_filter, + filter_ops); +} + static void __net_exit ip6table_filter_net_exit(struct net *net) { if (!net->ipv6.ip6table_filter) return; - ip6t_unregister_table(net, net->ipv6.ip6table_filter, filter_ops); + ip6t_unregister_table_exit(net, net->ipv6.ip6table_filter); net->ipv6.ip6table_filter = NULL; } static struct pernet_operations ip6table_filter_net_ops = { .init = ip6table_filter_net_init, + .pre_exit = ip6table_filter_net_pre_exit, .exit = ip6table_filter_net_exit, }; diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index 070afb97fa2b..1a2748611e00 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -93,16 +93,24 @@ static int __net_init ip6table_mangle_table_init(struct net *net) return ret; } +static void __net_exit ip6table_mangle_net_pre_exit(struct net *net) +{ + if (net->ipv6.ip6table_mangle) + ip6t_unregister_table_pre_exit(net, net->ipv6.ip6table_mangle, + mangle_ops); +} + static void __net_exit ip6table_mangle_net_exit(struct net *net) { if (!net->ipv6.ip6table_mangle) return; - ip6t_unregister_table(net, net->ipv6.ip6table_mangle, mangle_ops); + ip6t_unregister_table_exit(net, net->ipv6.ip6table_mangle); net->ipv6.ip6table_mangle = NULL; } static struct pernet_operations ip6table_mangle_net_ops = { + .pre_exit = ip6table_mangle_net_pre_exit, .exit = ip6table_mangle_net_exit, }; diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 0f4875952efc..0a23265e3caa 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -114,16 +114,22 @@ static int __net_init ip6table_nat_table_init(struct net *net) return ret; } +static void __net_exit ip6table_nat_net_pre_exit(struct net *net) +{ + if (net->ipv6.ip6table_nat) + ip6t_nat_unregister_lookups(net); +} + static void __net_exit ip6table_nat_net_exit(struct net *net) { if (!net->ipv6.ip6table_nat) return; - ip6t_nat_unregister_lookups(net); - ip6t_unregister_table(net, net->ipv6.ip6table_nat, NULL); + ip6t_unregister_table_exit(net, net->ipv6.ip6table_nat); net->ipv6.ip6table_nat = NULL; } static struct pernet_operations ip6table_nat_net_ops = { + .pre_exit = ip6table_nat_net_pre_exit, .exit = ip6table_nat_net_exit, }; diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index a22100b1cf2c..8f9e742226f7 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -66,15 +66,23 @@ static int __net_init ip6table_raw_table_init(struct net *net) return ret; } +static void __net_exit ip6table_raw_net_pre_exit(struct net *net) +{ + if (net->ipv6.ip6table_raw) + ip6t_unregister_table_pre_exit(net, net->ipv6.ip6table_raw, + rawtable_ops); +} + static void __net_exit ip6table_raw_net_exit(struct net *net) { if (!net->ipv6.ip6table_raw) return; - ip6t_unregister_table(net, net->ipv6.ip6table_raw, rawtable_ops); + ip6t_unregister_table_exit(net, net->ipv6.ip6table_raw); net->ipv6.ip6table_raw = NULL; } static struct pernet_operations ip6table_raw_net_ops = { + .pre_exit = ip6table_raw_net_pre_exit, .exit = ip6table_raw_net_exit, }; diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index a74335fe2bd9..5e8c48fed032 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -61,15 +61,23 @@ static int __net_init ip6table_security_table_init(struct net *net) return ret; } +static void __net_exit ip6table_security_net_pre_exit(struct net *net) +{ + if (net->ipv6.ip6table_security) + ip6t_unregister_table_pre_exit(net, net->ipv6.ip6table_security, + sectbl_ops); +} + static void __net_exit ip6table_security_net_exit(struct net *net) { if (!net->ipv6.ip6table_security) return; - ip6t_unregister_table(net, net->ipv6.ip6table_security, sectbl_ops); + ip6t_unregister_table_exit(net, net->ipv6.ip6table_security); net->ipv6.ip6table_security = NULL; } static struct pernet_operations ip6table_security_net_ops = { + .pre_exit = ip6table_security_net_pre_exit, .exit = ip6table_security_net_exit, }; -- cgit v1.2.3 From 619ae8e0697a6fb85b99b19137590c7c337c579e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 22 Jun 2020 10:28:32 +0200 Subject: selftests: netfilter: add test case for conntrack helper assignment check that 'nft ... ct helper set ' works: 1. configure ftp helper via nft and assign it to connections on port 2121 2. check with 'conntrack -L' that the next connection has the ftp helper attached to it. Also add a test for auto-assign (old behaviour). Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/Makefile | 2 +- .../selftests/netfilter/nft_conntrack_helper.sh | 175 +++++++++++++++++++++ 2 files changed, 176 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/netfilter/nft_conntrack_helper.sh diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile index 9c0f758310fe..a179f0dca8ce 100644 --- a/tools/testing/selftests/netfilter/Makefile +++ b/tools/testing/selftests/netfilter/Makefile @@ -3,7 +3,7 @@ TEST_PROGS := nft_trans_stress.sh nft_nat.sh bridge_brouter.sh \ conntrack_icmp_related.sh nft_flowtable.sh ipvs.sh \ - nft_concat_range.sh \ + nft_concat_range.sh nft_conntrack_helper.sh \ nft_queue.sh LDLIBS = -lmnl diff --git a/tools/testing/selftests/netfilter/nft_conntrack_helper.sh b/tools/testing/selftests/netfilter/nft_conntrack_helper.sh new file mode 100755 index 000000000000..edf0a48da6bf --- /dev/null +++ b/tools/testing/selftests/netfilter/nft_conntrack_helper.sh @@ -0,0 +1,175 @@ +#!/bin/bash +# +# This tests connection tracking helper assignment: +# 1. can attach ftp helper to a connection from nft ruleset. +# 2. auto-assign still works. +# +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 + +sfx=$(mktemp -u "XXXXXXXX") +ns1="ns1-$sfx" +ns2="ns2-$sfx" +testipv6=1 + +cleanup() +{ + ip netns del ${ns1} + ip netns del ${ns2} +} + +nft --version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without nft tool" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +conntrack -V > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without conntrack tool" + exit $ksft_skip +fi + +which nc >/dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without netcat tool" + exit $ksft_skip +fi + +trap cleanup EXIT + +ip netns add ${ns1} +ip netns add ${ns2} + +ip link add veth0 netns ${ns1} type veth peer name veth0 netns ${ns2} > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip +fi + +ip -net ${ns1} link set lo up +ip -net ${ns1} link set veth0 up + +ip -net ${ns2} link set lo up +ip -net ${ns2} link set veth0 up + +ip -net ${ns1} addr add 10.0.1.1/24 dev veth0 +ip -net ${ns1} addr add dead:1::1/64 dev veth0 + +ip -net ${ns2} addr add 10.0.1.2/24 dev veth0 +ip -net ${ns2} addr add dead:1::2/64 dev veth0 + +load_ruleset_family() { + local family=$1 + local ns=$2 + +ip netns exec ${ns} nft -f - < /dev/null |grep -q 'helper=ftp' + if [ $? -ne 0 ] ; then + echo "FAIL: ${netns} did not show attached helper $message" 1>&2 + ret=1 + fi + + echo "PASS: ${netns} connection on port $port has ftp helper attached" 1>&2 + return 0 +} + +test_helper() +{ + local port=$1 + local msg=$2 + + sleep 3 | ip netns exec ${ns2} nc -w 2 -l -p $port > /dev/null & + + sleep 1 + sleep 1 | ip netns exec ${ns1} nc -w 2 10.0.1.2 $port > /dev/null & + + check_for_helper "$ns1" "ip $msg" $port + check_for_helper "$ns2" "ip $msg" $port + + wait + + if [ $testipv6 -eq 0 ] ;then + return 0 + fi + + ip netns exec ${ns1} conntrack -F 2> /dev/null + ip netns exec ${ns2} conntrack -F 2> /dev/null + + sleep 3 | ip netns exec ${ns2} nc -w 2 -6 -l -p $port > /dev/null & + + sleep 1 + sleep 1 | ip netns exec ${ns1} nc -w 2 -6 dead:1::2 $port > /dev/null & + + check_for_helper "$ns1" "ipv6 $msg" $port + check_for_helper "$ns2" "ipv6 $msg" $port + + wait +} + +load_ruleset_family ip ${ns1} +if [ $? -ne 0 ];then + echo "FAIL: ${ns1} cannot load ip ruleset" 1>&2 + exit 1 +fi + +load_ruleset_family ip6 ${ns1} +if [ $? -ne 0 ];then + echo "SKIP: ${ns1} cannot load ip6 ruleset" 1>&2 + testipv6=0 +fi + +load_ruleset_family inet ${ns2} +if [ $? -ne 0 ];then + echo "SKIP: ${ns1} cannot load inet ruleset" 1>&2 + load_ruleset_family ip ${ns2} + if [ $? -ne 0 ];then + echo "FAIL: ${ns2} cannot load ip ruleset" 1>&2 + exit 1 + fi + + if [ $testipv6 -eq 1 ] ;then + load_ruleset_family ip6 ${ns2} + if [ $? -ne 0 ];then + echo "FAIL: ${ns2} cannot load ip6 ruleset" 1>&2 + exit 1 + fi + fi +fi + +test_helper 2121 "set via ruleset" +ip netns exec ${ns1} sysctl -q 'net.netfilter.nf_conntrack_helper=1' +ip netns exec ${ns2} sysctl -q 'net.netfilter.nf_conntrack_helper=1' +test_helper 21 "auto-assign" + +exit $ret -- cgit v1.2.3 From 673bafd5b8b0ec9b1e2eedca2a2be6b44ee5ba9c Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Wed, 24 Jun 2020 18:14:53 -0700 Subject: net: bcmgenet: re-remove bcmgenet_hfb_add_filter This function was originally removed by Baoyou Xie in commit e2072600a241 ("net: bcmgenet: remove unused function in bcmgenet.c") to prevent a build warning. Some of the functions removed by Baoyou Xie are now used for WAKE_FILTER support so his commit was reverted, but this function is still unused and the kbuild test robot dutifully reported the warning. This commit once again removes the remaining unused hfb functions. Fixes: 14da1510fedc ("Revert "net: bcmgenet: remove unused function in bcmgenet.c"") Reported-by: kbuild test robot Signed-off-by: Doug Berger Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 77 -------------------------- 1 file changed, 77 deletions(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index ff31da0ed846..f1fa11665319 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -459,17 +459,6 @@ static inline void bcmgenet_rdma_ring_writel(struct bcmgenet_priv *priv, genet_dma_ring_regs[r]); } -static bool bcmgenet_hfb_is_filter_enabled(struct bcmgenet_priv *priv, - u32 f_index) -{ - u32 offset; - u32 reg; - - offset = HFB_FLT_ENABLE_V3PLUS + (f_index < 32) * sizeof(u32); - reg = bcmgenet_hfb_reg_readl(priv, offset); - return !!(reg & (1 << (f_index % 32))); -} - static void bcmgenet_hfb_enable_filter(struct bcmgenet_priv *priv, u32 f_index) { u32 offset; @@ -533,19 +522,6 @@ static void bcmgenet_hfb_set_filter_length(struct bcmgenet_priv *priv, bcmgenet_hfb_reg_writel(priv, reg, offset); } -static int bcmgenet_hfb_find_unused_filter(struct bcmgenet_priv *priv) -{ - u32 f_index; - - /* First MAX_NUM_OF_FS_RULES are reserved for Rx NFC filters */ - for (f_index = MAX_NUM_OF_FS_RULES; - f_index < priv->hw_params->hfb_filter_cnt; f_index++) - if (!bcmgenet_hfb_is_filter_enabled(priv, f_index)) - return f_index; - - return -ENOMEM; -} - static int bcmgenet_hfb_validate_mask(void *mask, size_t size) { while (size) { @@ -744,59 +720,6 @@ static int bcmgenet_hfb_create_rxnfc_filter(struct bcmgenet_priv *priv, return err; } -/* bcmgenet_hfb_add_filter - * - * Add new filter to Hardware Filter Block to match and direct Rx traffic to - * desired Rx queue. - * - * f_data is an array of unsigned 32-bit integers where each 32-bit integer - * provides filter data for 2 bytes (4 nibbles) of Rx frame: - * - * bits 31:20 - unused - * bit 19 - nibble 0 match enable - * bit 18 - nibble 1 match enable - * bit 17 - nibble 2 match enable - * bit 16 - nibble 3 match enable - * bits 15:12 - nibble 0 data - * bits 11:8 - nibble 1 data - * bits 7:4 - nibble 2 data - * bits 3:0 - nibble 3 data - * - * Example: - * In order to match: - * - Ethernet frame type = 0x0800 (IP) - * - IP version field = 4 - * - IP protocol field = 0x11 (UDP) - * - * The following filter is needed: - * u32 hfb_filter_ipv4_udp[] = { - * Rx frame offset 0x00: 0x00000000, 0x00000000, 0x00000000, 0x00000000, - * Rx frame offset 0x08: 0x00000000, 0x00000000, 0x000F0800, 0x00084000, - * Rx frame offset 0x10: 0x00000000, 0x00000000, 0x00000000, 0x00030011, - * }; - * - * To add the filter to HFB and direct the traffic to Rx queue 0, call: - * bcmgenet_hfb_add_filter(priv, hfb_filter_ipv4_udp, - * ARRAY_SIZE(hfb_filter_ipv4_udp), 0); - */ -int bcmgenet_hfb_add_filter(struct bcmgenet_priv *priv, u32 *f_data, - u32 f_length, u32 rx_queue) -{ - int f_index; - - f_index = bcmgenet_hfb_find_unused_filter(priv); - if (f_index < 0) - return -ENOMEM; - - if (f_length > priv->hw_params->hfb_filter_size) - return -EINVAL; - - bcmgenet_hfb_set_filter(priv, f_data, f_length, rx_queue, f_index); - bcmgenet_hfb_enable_filter(priv, f_index); - - return 0; -} - /* bcmgenet_hfb_clear * * Clear Hardware Filter Block and disable all filtering. -- cgit v1.2.3 From d966d2efb643a8ba925e910bfc10e19c8b018de7 Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Wed, 24 Jun 2020 18:14:54 -0700 Subject: net: bcmgenet: use __be16 for htons(ETH_P_IP) The 16-bit value that holds a short in network byte order should be declared as a restricted big endian type to allow type checks to succeed during assignment. Fixes: 3e370952287c ("net: bcmgenet: add support for ethtool rxnfc flows") Reported-by: kbuild test robot Signed-off-by: Doug Berger Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index f1fa11665319..c63f01e2bb03 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -610,8 +610,9 @@ static int bcmgenet_hfb_create_rxnfc_filter(struct bcmgenet_priv *priv, { struct ethtool_rx_flow_spec *fs = &rule->fs; int err = 0, offset = 0, f_length = 0; - u16 val_16, mask_16; u8 val_8, mask_8; + __be16 val_16; + u16 mask_16; size_t size; u32 *f_data; -- cgit v1.2.3 From 20d1f2d1b024f6be199a3bedf1578a1d21592bc5 Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Wed, 24 Jun 2020 18:14:55 -0700 Subject: net: bcmgenet: use hardware padding of runt frames When commit 474ea9cafc45 ("net: bcmgenet: correctly pad short packets") added the call to skb_padto() it should have been located before the nr_frags parameter was read since that value could be changed when padding packets with lengths between 55 and 59 bytes (inclusive). The use of a stale nr_frags value can cause corruption of the pad data when tx-scatter-gather is enabled. This corruption of the pad can cause invalid checksum computation when hardware offload of tx-checksum is also enabled. Since the original reason for the padding was corrected by commit 7dd399130efb ("net: bcmgenet: fix skb_len in bcmgenet_xmit_single()") we can remove the software padding all together and make use of hardware padding of short frames as long as the hardware also always appends the FCS value to the frame. Fixes: 474ea9cafc45 ("net: bcmgenet: correctly pad short packets") Signed-off-by: Doug Berger Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index c63f01e2bb03..af924a8b885f 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -2042,11 +2042,6 @@ static netdev_tx_t bcmgenet_xmit(struct sk_buff *skb, struct net_device *dev) goto out; } - if (skb_padto(skb, ETH_ZLEN)) { - ret = NETDEV_TX_OK; - goto out; - } - /* Retain how many bytes will be sent on the wire, without TSB inserted * by transmit checksum offload */ @@ -2093,6 +2088,9 @@ static netdev_tx_t bcmgenet_xmit(struct sk_buff *skb, struct net_device *dev) len_stat = (size << DMA_BUFLENGTH_SHIFT) | (priv->hw_params->qtag_mask << DMA_TX_QTAG_SHIFT); + /* Note: if we ever change from DMA_TX_APPEND_CRC below we + * will need to restore software padding of "runt" packets + */ if (!i) { len_stat |= DMA_TX_APPEND_CRC | DMA_SOP; if (skb->ip_summed == CHECKSUM_PARTIAL) -- cgit v1.2.3 From 5a3235e50c0e3219d8659cbec996dae960acfee8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 25 Jun 2020 09:18:16 +0200 Subject: net: phy: mscc: avoid skcipher API for single block AES encryption The skcipher API dynamically instantiates the transformation object on request that implements the requested algorithm optimally on the given platform. This notion of optimality only matters for cases like bulk network or disk encryption, where performance can be a bottleneck, or in cases where the algorithm itself is not known at compile time. In the mscc case, we are dealing with AES encryption of a single block, and so neither concern applies, and we are better off using the AES library interface, which is lightweight and safe for this kind of use. Note that the scatterlist API does not permit references to buffers that are located on the stack, so the existing code is incorrect in any case, but avoiding the skcipher and scatterlist APIs entirely is the most straight-forward approach to fixing this. Cc: Antoine Tenart Cc: Andrew Lunn Cc: Florian Fainelli Cc: Heiner Kallweit Cc: "David S. Miller" Cc: Jakub Kicinski Fixes: 28c5107aa904e ("net: phy: mscc: macsec support") Reviewed-by: Eric Biggers Signed-off-by: Ard Biesheuvel Tested-by: Antoine Tenart Signed-off-by: David S. Miller --- drivers/net/phy/Kconfig | 3 +-- drivers/net/phy/mscc/mscc_macsec.c | 40 +++++++++----------------------------- 2 files changed, 10 insertions(+), 33 deletions(-) diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index f25702386d83..e351d65533aa 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -480,8 +480,7 @@ config MICROCHIP_T1_PHY config MICROSEMI_PHY tristate "Microsemi PHYs" depends on MACSEC || MACSEC=n - select CRYPTO_AES - select CRYPTO_ECB + select CRYPTO_LIB_AES if MACSEC help Currently supports VSC8514, VSC8530, VSC8531, VSC8540 and VSC8541 PHYs diff --git a/drivers/net/phy/mscc/mscc_macsec.c b/drivers/net/phy/mscc/mscc_macsec.c index b4d3dc4068e2..d53ca884b5c9 100644 --- a/drivers/net/phy/mscc/mscc_macsec.c +++ b/drivers/net/phy/mscc/mscc_macsec.c @@ -10,7 +10,7 @@ #include #include -#include +#include #include @@ -500,39 +500,17 @@ static u32 vsc8584_macsec_flow_context_id(struct macsec_flow *flow) static int vsc8584_macsec_derive_key(const u8 key[MACSEC_KEYID_LEN], u16 key_len, u8 hkey[16]) { - struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0); - struct skcipher_request *req = NULL; - struct scatterlist src, dst; - DECLARE_CRYPTO_WAIT(wait); - u32 input[4] = {0}; + const u8 input[AES_BLOCK_SIZE] = {0}; + struct crypto_aes_ctx ctx; int ret; - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - req = skcipher_request_alloc(tfm, GFP_KERNEL); - if (!req) { - ret = -ENOMEM; - goto out; - } - - skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | - CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, - &wait); - ret = crypto_skcipher_setkey(tfm, key, key_len); - if (ret < 0) - goto out; - - sg_init_one(&src, input, 16); - sg_init_one(&dst, hkey, 16); - skcipher_request_set_crypt(req, &src, &dst, 16, NULL); - - ret = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); + ret = aes_expandkey(&ctx, key, key_len); + if (ret) + return ret; -out: - skcipher_request_free(req); - crypto_free_skcipher(tfm); - return ret; + aes_encrypt(&ctx, hkey, input); + memzero_explicit(&ctx, sizeof(ctx)); + return 0; } static int vsc8584_macsec_transformation(struct phy_device *phydev, -- cgit v1.2.3 From 2570284060b48f3f79d8f1a2698792f36c385e9a Mon Sep 17 00:00:00 2001 From: Denis Kirjanov Date: Thu, 25 Jun 2020 14:51:06 +0300 Subject: tcp: don't ignore ECN CWR on pure ACK there is a problem with the CWR flag set in an incoming ACK segment and it leads to the situation when the ECE flag is latched forever the following packetdrill script shows what happens: // Stack receives incoming segments with CE set +0.1 <[ect0] . 11001:12001(1000) ack 1001 win 65535 +0.0 <[ce] . 12001:13001(1000) ack 1001 win 65535 +0.0 <[ect0] P. 13001:14001(1000) ack 1001 win 65535 // Stack repsonds with ECN ECHO +0.0 >[noecn] . 1001:1001(0) ack 12001 +0.0 >[noecn] E. 1001:1001(0) ack 13001 +0.0 >[noecn] E. 1001:1001(0) ack 14001 // Write a packet +0.1 write(3, ..., 1000) = 1000 +0.0 >[ect0] PE. 1001:2001(1000) ack 14001 // Pure ACK received +0.01 <[noecn] W. 14001:14001(0) ack 2001 win 65535 // Since CWR was sent, this packet should NOT have ECE set +0.1 write(3, ..., 1000) = 1000 +0.0 >[ect0] P. 2001:3001(1000) ack 14001 // but Linux will still keep ECE latched here, with packetdrill // flagging a missing ECE flag, expecting // >[ect0] PE. 2001:3001(1000) ack 14001 // in the script In the situation above we will continue to send ECN ECHO packets and trigger the peer to reduce the congestion window. To avoid that we can check CWR on pure ACKs received. v3: - Add a sequence check to avoid sending an ACK to an ACK v2: - Adjusted the comment - move CWR check before checking for unacknowledged packets Signed-off-by: Denis Kirjanov Acked-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 12fda8f27b08..f3a0eb139b76 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -261,7 +261,8 @@ static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb) * cwnd may be very low (even just 1 packet), so we should ACK * immediately. */ - inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; } } @@ -3665,6 +3666,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_in_ack_event(sk, ack_ev_flags); } + /* This is a deviation from RFC3168 since it states that: + * "When the TCP data sender is ready to set the CWR bit after reducing + * the congestion window, it SHOULD set the CWR bit only on the first + * new data packet that it transmits." + * We accept CWR on pure ACKs to be more robust + * with widely-deployed TCP implementations that do this. + */ + tcp_ecn_accept_cwr(sk, skb); + /* We passed data and got it acked, remove any soft error * log. Something worked... */ @@ -4800,8 +4810,6 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) skb_dst_drop(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); - tcp_ecn_accept_cwr(sk, skb); - tp->rx_opt.dsack = 0; /* Queue data for delivery to the user. -- cgit v1.2.3 From 206e732323c2a8e6d11f4125a62c27e60a50a850 Mon Sep 17 00:00:00 2001 From: Thomas Martitz Date: Thu, 25 Jun 2020 14:26:03 +0200 Subject: net: bridge: enfore alignment for ethernet address The eth_addr member is passed to ether_addr functions that require 2-byte alignment, therefore the member must be properly aligned to avoid unaligned accesses. The problem is in place since the initial merge of multicast to unicast: commit 6db6f0eae6052b70885562e1733896647ec1d807 bridge: multicast to unicast Fixes: 6db6f0eae605 ("bridge: multicast to unicast") Cc: Roopa Prabhu Cc: Nikolay Aleksandrov Cc: David S. Miller Cc: Jakub Kicinski Cc: Felix Fietkau Signed-off-by: Thomas Martitz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_private.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 7501be4eeba0..2130fe0194e6 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -217,8 +217,8 @@ struct net_bridge_port_group { struct rcu_head rcu; struct timer_list timer; struct br_ip addr; + unsigned char eth_addr[ETH_ALEN] __aligned(2); unsigned char flags; - unsigned char eth_addr[ETH_ALEN]; }; struct net_bridge_mdb_entry { -- cgit v1.2.3 From 0eaf228d574bd82a9aed73e3953bfb81721f4227 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 24 Jun 2020 13:08:17 +0300 Subject: net: macb: call pm_runtime_put_sync on failure path Call pm_runtime_put_sync() on failure path of at91ether_open. Fixes: e6a41c23df0d ("net: macb: ensure interface is not suspended on at91rm9200") Signed-off-by: Claudiu Beznea Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb_main.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 257c4920cb88..5705359a3612 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -3837,7 +3837,7 @@ static int at91ether_open(struct net_device *dev) ret = at91ether_start(dev); if (ret) - return ret; + goto pm_exit; /* Enable MAC interrupts */ macb_writel(lp, IER, MACB_BIT(RCOMP) | @@ -3850,11 +3850,15 @@ static int at91ether_open(struct net_device *dev) ret = macb_phylink_connect(lp); if (ret) - return ret; + goto pm_exit; netif_start_queue(dev); return 0; + +pm_exit: + pm_runtime_put_sync(&lp->pdev->dev); + return ret; } /* Close the interface */ -- cgit v1.2.3 From 33fdef24c9ac20c68b71b363e423fbf9ad2bfc1e Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 24 Jun 2020 13:08:18 +0300 Subject: net: macb: free resources on failure path of at91ether_open() DMA buffers were not freed on failure path of at91ether_open(). Along with changes for freeing the DMA buffers the enable/disable interrupt instructions were moved to at91ether_start()/at91ether_stop() functions and the operations on at91ether_stop() were done in their reverse order (compared with how is done in at91ether_start()): before this patch the operation order on interface open path was as follows: 1/ alloc DMA buffers 2/ enable tx, rx 3/ enable interrupts and the order on interface close path was as follows: 1/ disable tx, rx 2/ disable interrupts 3/ free dma buffers. Fixes: 7897b071ac3b ("net: macb: convert to phylink") Signed-off-by: Claudiu Beznea Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb_main.c | 116 +++++++++++++++++++------------ 1 file changed, 73 insertions(+), 43 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 5705359a3612..52582e8ed90e 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -3762,15 +3762,9 @@ static int macb_init(struct platform_device *pdev) static struct sifive_fu540_macb_mgmt *mgmt; -/* Initialize and start the Receiver and Transmit subsystems */ -static int at91ether_start(struct net_device *dev) +static int at91ether_alloc_coherent(struct macb *lp) { - struct macb *lp = netdev_priv(dev); struct macb_queue *q = &lp->queues[0]; - struct macb_dma_desc *desc; - dma_addr_t addr; - u32 ctl; - int i; q->rx_ring = dma_alloc_coherent(&lp->pdev->dev, (AT91ETHER_MAX_RX_DESCR * @@ -3792,6 +3786,43 @@ static int at91ether_start(struct net_device *dev) return -ENOMEM; } + return 0; +} + +static void at91ether_free_coherent(struct macb *lp) +{ + struct macb_queue *q = &lp->queues[0]; + + if (q->rx_ring) { + dma_free_coherent(&lp->pdev->dev, + AT91ETHER_MAX_RX_DESCR * + macb_dma_desc_get_size(lp), + q->rx_ring, q->rx_ring_dma); + q->rx_ring = NULL; + } + + if (q->rx_buffers) { + dma_free_coherent(&lp->pdev->dev, + AT91ETHER_MAX_RX_DESCR * + AT91ETHER_MAX_RBUFF_SZ, + q->rx_buffers, q->rx_buffers_dma); + q->rx_buffers = NULL; + } +} + +/* Initialize and start the Receiver and Transmit subsystems */ +static int at91ether_start(struct macb *lp) +{ + struct macb_queue *q = &lp->queues[0]; + struct macb_dma_desc *desc; + dma_addr_t addr; + u32 ctl; + int i, ret; + + ret = at91ether_alloc_coherent(lp); + if (ret) + return ret; + addr = q->rx_buffers_dma; for (i = 0; i < AT91ETHER_MAX_RX_DESCR; i++) { desc = macb_rx_desc(q, i); @@ -3813,9 +3844,39 @@ static int at91ether_start(struct net_device *dev) ctl = macb_readl(lp, NCR); macb_writel(lp, NCR, ctl | MACB_BIT(RE) | MACB_BIT(TE)); + /* Enable MAC interrupts */ + macb_writel(lp, IER, MACB_BIT(RCOMP) | + MACB_BIT(RXUBR) | + MACB_BIT(ISR_TUND) | + MACB_BIT(ISR_RLE) | + MACB_BIT(TCOMP) | + MACB_BIT(ISR_ROVR) | + MACB_BIT(HRESP)); + return 0; } +static void at91ether_stop(struct macb *lp) +{ + u32 ctl; + + /* Disable MAC interrupts */ + macb_writel(lp, IDR, MACB_BIT(RCOMP) | + MACB_BIT(RXUBR) | + MACB_BIT(ISR_TUND) | + MACB_BIT(ISR_RLE) | + MACB_BIT(TCOMP) | + MACB_BIT(ISR_ROVR) | + MACB_BIT(HRESP)); + + /* Disable Receiver and Transmitter */ + ctl = macb_readl(lp, NCR); + macb_writel(lp, NCR, ctl & ~(MACB_BIT(TE) | MACB_BIT(RE))); + + /* Free resources. */ + at91ether_free_coherent(lp); +} + /* Open the ethernet interface */ static int at91ether_open(struct net_device *dev) { @@ -3835,27 +3896,20 @@ static int at91ether_open(struct net_device *dev) macb_set_hwaddr(lp); - ret = at91ether_start(dev); + ret = at91ether_start(lp); if (ret) goto pm_exit; - /* Enable MAC interrupts */ - macb_writel(lp, IER, MACB_BIT(RCOMP) | - MACB_BIT(RXUBR) | - MACB_BIT(ISR_TUND) | - MACB_BIT(ISR_RLE) | - MACB_BIT(TCOMP) | - MACB_BIT(ISR_ROVR) | - MACB_BIT(HRESP)); - ret = macb_phylink_connect(lp); if (ret) - goto pm_exit; + goto stop; netif_start_queue(dev); return 0; +stop: + at91ether_stop(lp); pm_exit: pm_runtime_put_sync(&lp->pdev->dev); return ret; @@ -3865,37 +3919,13 @@ pm_exit: static int at91ether_close(struct net_device *dev) { struct macb *lp = netdev_priv(dev); - struct macb_queue *q = &lp->queues[0]; - u32 ctl; - - /* Disable Receiver and Transmitter */ - ctl = macb_readl(lp, NCR); - macb_writel(lp, NCR, ctl & ~(MACB_BIT(TE) | MACB_BIT(RE))); - - /* Disable MAC interrupts */ - macb_writel(lp, IDR, MACB_BIT(RCOMP) | - MACB_BIT(RXUBR) | - MACB_BIT(ISR_TUND) | - MACB_BIT(ISR_RLE) | - MACB_BIT(TCOMP) | - MACB_BIT(ISR_ROVR) | - MACB_BIT(HRESP)); netif_stop_queue(dev); phylink_stop(lp->phylink); phylink_disconnect_phy(lp->phylink); - dma_free_coherent(&lp->pdev->dev, - AT91ETHER_MAX_RX_DESCR * - macb_dma_desc_get_size(lp), - q->rx_ring, q->rx_ring_dma); - q->rx_ring = NULL; - - dma_free_coherent(&lp->pdev->dev, - AT91ETHER_MAX_RX_DESCR * AT91ETHER_MAX_RBUFF_SZ, - q->rx_buffers, q->rx_buffers_dma); - q->rx_buffers = NULL; + at91ether_stop(lp); return pm_runtime_put(&lp->pdev->dev); } -- cgit v1.2.3 From e39109f59614e5646e6c53100a3e7e8f63dd1d2b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 24 Jun 2020 16:54:44 +0300 Subject: net: dsa: sja1105: move sja1105_compose_gating_subschedule at the top It turns out that sja1105_compose_gating_subschedule must also be called from sja1105_vl_delete, to recalculate the overall tc-gate configuration. Currently this is not possible without introducing a forward declaration. So move the function at the top of the file, along with its dependencies. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_vl.c | 320 +++++++++++++++++------------------ 1 file changed, 160 insertions(+), 160 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_vl.c b/drivers/net/dsa/sja1105/sja1105_vl.c index 0056f9c1e471..5ffa71f02064 100644 --- a/drivers/net/dsa/sja1105/sja1105_vl.c +++ b/drivers/net/dsa/sja1105/sja1105_vl.c @@ -7,6 +7,166 @@ #define SJA1105_SIZE_VL_STATUS 8 +/* Insert into the global gate list, sorted by gate action time. */ +static int sja1105_insert_gate_entry(struct sja1105_gating_config *gating_cfg, + struct sja1105_rule *rule, + u8 gate_state, s64 entry_time, + struct netlink_ext_ack *extack) +{ + struct sja1105_gate_entry *e; + int rc; + + e = kzalloc(sizeof(*e), GFP_KERNEL); + if (!e) + return -ENOMEM; + + e->rule = rule; + e->gate_state = gate_state; + e->interval = entry_time; + + if (list_empty(&gating_cfg->entries)) { + list_add(&e->list, &gating_cfg->entries); + } else { + struct sja1105_gate_entry *p; + + list_for_each_entry(p, &gating_cfg->entries, list) { + if (p->interval == e->interval) { + NL_SET_ERR_MSG_MOD(extack, + "Gate conflict"); + rc = -EBUSY; + goto err; + } + + if (e->interval < p->interval) + break; + } + list_add(&e->list, p->list.prev); + } + + gating_cfg->num_entries++; + + return 0; +err: + kfree(e); + return rc; +} + +/* The gate entries contain absolute times in their e->interval field. Convert + * that to proper intervals (i.e. "0, 5, 10, 15" to "5, 5, 5, 5"). + */ +static void +sja1105_gating_cfg_time_to_interval(struct sja1105_gating_config *gating_cfg, + u64 cycle_time) +{ + struct sja1105_gate_entry *last_e; + struct sja1105_gate_entry *e; + struct list_head *prev; + + list_for_each_entry(e, &gating_cfg->entries, list) { + struct sja1105_gate_entry *p; + + prev = e->list.prev; + + if (prev == &gating_cfg->entries) + continue; + + p = list_entry(prev, struct sja1105_gate_entry, list); + p->interval = e->interval - p->interval; + } + last_e = list_last_entry(&gating_cfg->entries, + struct sja1105_gate_entry, list); + if (last_e->list.prev != &gating_cfg->entries) + last_e->interval = cycle_time - last_e->interval; +} + +static void sja1105_free_gating_config(struct sja1105_gating_config *gating_cfg) +{ + struct sja1105_gate_entry *e, *n; + + list_for_each_entry_safe(e, n, &gating_cfg->entries, list) { + list_del(&e->list); + kfree(e); + } +} + +static int sja1105_compose_gating_subschedule(struct sja1105_private *priv, + struct netlink_ext_ack *extack) +{ + struct sja1105_gating_config *gating_cfg = &priv->tas_data.gating_cfg; + struct sja1105_rule *rule; + s64 max_cycle_time = 0; + s64 its_base_time = 0; + int i, rc = 0; + + list_for_each_entry(rule, &priv->flow_block.rules, list) { + if (rule->type != SJA1105_RULE_VL) + continue; + if (rule->vl.type != SJA1105_VL_TIME_TRIGGERED) + continue; + + if (max_cycle_time < rule->vl.cycle_time) { + max_cycle_time = rule->vl.cycle_time; + its_base_time = rule->vl.base_time; + } + } + + if (!max_cycle_time) + return 0; + + dev_dbg(priv->ds->dev, "max_cycle_time %lld its_base_time %lld\n", + max_cycle_time, its_base_time); + + sja1105_free_gating_config(gating_cfg); + + gating_cfg->base_time = its_base_time; + gating_cfg->cycle_time = max_cycle_time; + gating_cfg->num_entries = 0; + + list_for_each_entry(rule, &priv->flow_block.rules, list) { + s64 time; + s64 rbt; + + if (rule->type != SJA1105_RULE_VL) + continue; + if (rule->vl.type != SJA1105_VL_TIME_TRIGGERED) + continue; + + /* Calculate the difference between this gating schedule's + * base time, and the base time of the gating schedule with the + * longest cycle time. We call it the relative base time (rbt). + */ + rbt = future_base_time(rule->vl.base_time, rule->vl.cycle_time, + its_base_time); + rbt -= its_base_time; + + time = rbt; + + for (i = 0; i < rule->vl.num_entries; i++) { + u8 gate_state = rule->vl.entries[i].gate_state; + s64 entry_time = time; + + while (entry_time < max_cycle_time) { + rc = sja1105_insert_gate_entry(gating_cfg, rule, + gate_state, + entry_time, + extack); + if (rc) + goto err; + + entry_time += rule->vl.cycle_time; + } + time += rule->vl.entries[i].interval; + } + } + + sja1105_gating_cfg_time_to_interval(gating_cfg, max_cycle_time); + + return 0; +err: + sja1105_free_gating_config(gating_cfg); + return rc; +} + /* The switch flow classification core implements TTEthernet, which 'thinks' in * terms of Virtual Links (VL), a concept borrowed from ARINC 664 part 7. * However it also has one other operating mode (VLLUPFORMAT=0) where it acts @@ -397,166 +557,6 @@ int sja1105_vl_delete(struct sja1105_private *priv, int port, return sja1105_static_config_reload(priv, SJA1105_VIRTUAL_LINKS); } -/* Insert into the global gate list, sorted by gate action time. */ -static int sja1105_insert_gate_entry(struct sja1105_gating_config *gating_cfg, - struct sja1105_rule *rule, - u8 gate_state, s64 entry_time, - struct netlink_ext_ack *extack) -{ - struct sja1105_gate_entry *e; - int rc; - - e = kzalloc(sizeof(*e), GFP_KERNEL); - if (!e) - return -ENOMEM; - - e->rule = rule; - e->gate_state = gate_state; - e->interval = entry_time; - - if (list_empty(&gating_cfg->entries)) { - list_add(&e->list, &gating_cfg->entries); - } else { - struct sja1105_gate_entry *p; - - list_for_each_entry(p, &gating_cfg->entries, list) { - if (p->interval == e->interval) { - NL_SET_ERR_MSG_MOD(extack, - "Gate conflict"); - rc = -EBUSY; - goto err; - } - - if (e->interval < p->interval) - break; - } - list_add(&e->list, p->list.prev); - } - - gating_cfg->num_entries++; - - return 0; -err: - kfree(e); - return rc; -} - -/* The gate entries contain absolute times in their e->interval field. Convert - * that to proper intervals (i.e. "0, 5, 10, 15" to "5, 5, 5, 5"). - */ -static void -sja1105_gating_cfg_time_to_interval(struct sja1105_gating_config *gating_cfg, - u64 cycle_time) -{ - struct sja1105_gate_entry *last_e; - struct sja1105_gate_entry *e; - struct list_head *prev; - - list_for_each_entry(e, &gating_cfg->entries, list) { - struct sja1105_gate_entry *p; - - prev = e->list.prev; - - if (prev == &gating_cfg->entries) - continue; - - p = list_entry(prev, struct sja1105_gate_entry, list); - p->interval = e->interval - p->interval; - } - last_e = list_last_entry(&gating_cfg->entries, - struct sja1105_gate_entry, list); - if (last_e->list.prev != &gating_cfg->entries) - last_e->interval = cycle_time - last_e->interval; -} - -static void sja1105_free_gating_config(struct sja1105_gating_config *gating_cfg) -{ - struct sja1105_gate_entry *e, *n; - - list_for_each_entry_safe(e, n, &gating_cfg->entries, list) { - list_del(&e->list); - kfree(e); - } -} - -static int sja1105_compose_gating_subschedule(struct sja1105_private *priv, - struct netlink_ext_ack *extack) -{ - struct sja1105_gating_config *gating_cfg = &priv->tas_data.gating_cfg; - struct sja1105_rule *rule; - s64 max_cycle_time = 0; - s64 its_base_time = 0; - int i, rc = 0; - - list_for_each_entry(rule, &priv->flow_block.rules, list) { - if (rule->type != SJA1105_RULE_VL) - continue; - if (rule->vl.type != SJA1105_VL_TIME_TRIGGERED) - continue; - - if (max_cycle_time < rule->vl.cycle_time) { - max_cycle_time = rule->vl.cycle_time; - its_base_time = rule->vl.base_time; - } - } - - if (!max_cycle_time) - return 0; - - dev_dbg(priv->ds->dev, "max_cycle_time %lld its_base_time %lld\n", - max_cycle_time, its_base_time); - - sja1105_free_gating_config(gating_cfg); - - gating_cfg->base_time = its_base_time; - gating_cfg->cycle_time = max_cycle_time; - gating_cfg->num_entries = 0; - - list_for_each_entry(rule, &priv->flow_block.rules, list) { - s64 time; - s64 rbt; - - if (rule->type != SJA1105_RULE_VL) - continue; - if (rule->vl.type != SJA1105_VL_TIME_TRIGGERED) - continue; - - /* Calculate the difference between this gating schedule's - * base time, and the base time of the gating schedule with the - * longest cycle time. We call it the relative base time (rbt). - */ - rbt = future_base_time(rule->vl.base_time, rule->vl.cycle_time, - its_base_time); - rbt -= its_base_time; - - time = rbt; - - for (i = 0; i < rule->vl.num_entries; i++) { - u8 gate_state = rule->vl.entries[i].gate_state; - s64 entry_time = time; - - while (entry_time < max_cycle_time) { - rc = sja1105_insert_gate_entry(gating_cfg, rule, - gate_state, - entry_time, - extack); - if (rc) - goto err; - - entry_time += rule->vl.cycle_time; - } - time += rule->vl.entries[i].interval; - } - } - - sja1105_gating_cfg_time_to_interval(gating_cfg, max_cycle_time); - - return 0; -err: - sja1105_free_gating_config(gating_cfg); - return rc; -} - int sja1105_vl_gate(struct sja1105_private *priv, int port, struct netlink_ext_ack *extack, unsigned long cookie, struct sja1105_key *key, u32 index, s32 prio, -- cgit v1.2.3 From 026bdb2b96525edd1fb5cc6f24587ff08d277d5c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 24 Jun 2020 16:54:45 +0300 Subject: net: dsa: sja1105: unconditionally free old gating config Currently sja1105_compose_gating_subschedule is not prepared to be called for the case where we want to recompute the global tc-gate configuration after we've deleted those actions on a port. After deleting the tc-gate actions on the last port, max_cycle_time would become zero, and that would incorrectly prevent sja1105_free_gating_config from getting called. So move the freeing function above the check for the need to apply a new configuration. Fixes: 834f8933d5dd ("net: dsa: sja1105: implement tc-gate using time-triggered virtual links") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_vl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_vl.c b/drivers/net/dsa/sja1105/sja1105_vl.c index 5ffa71f02064..5ff370f507e6 100644 --- a/drivers/net/dsa/sja1105/sja1105_vl.c +++ b/drivers/net/dsa/sja1105/sja1105_vl.c @@ -98,6 +98,8 @@ static int sja1105_compose_gating_subschedule(struct sja1105_private *priv, s64 its_base_time = 0; int i, rc = 0; + sja1105_free_gating_config(gating_cfg); + list_for_each_entry(rule, &priv->flow_block.rules, list) { if (rule->type != SJA1105_RULE_VL) continue; @@ -116,8 +118,6 @@ static int sja1105_compose_gating_subschedule(struct sja1105_private *priv, dev_dbg(priv->ds->dev, "max_cycle_time %lld its_base_time %lld\n", max_cycle_time, its_base_time); - sja1105_free_gating_config(gating_cfg); - gating_cfg->base_time = its_base_time; gating_cfg->cycle_time = max_cycle_time; gating_cfg->num_entries = 0; -- cgit v1.2.3 From 82f6896a25ee2471fdf039eb7b286a692d78f504 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 24 Jun 2020 16:54:46 +0300 Subject: net: dsa: sja1105: recalculate gating subschedule after deleting tc-gate rules Currently, tas_data->enabled would remain true even after deleting all tc-gate rules from the switch ports, which would cause the sja1105_tas_state_machine to get unnecessarily scheduled. Also, if there were any errors which would prevent the hardware from enabling the gating schedule, the sja1105_tas_state_machine would continuously detect and print that, spamming the kernel log, even if the rules were subsequently deleted. The rules themselves are _not_ active, because sja1105_init_scheduling does enough of a job to not install the gating schedule in the static config. But the virtual link rules themselves are still present. So call the functions that remove the tc-gate configuration from priv->tas_data.gating_cfg, so that tas_data->enabled can be set to false, and sja1105_tas_state_machine will stop from being scheduled. Fixes: 834f8933d5dd ("net: dsa: sja1105: implement tc-gate using time-triggered virtual links") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_vl.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/dsa/sja1105/sja1105_vl.c b/drivers/net/dsa/sja1105/sja1105_vl.c index 5ff370f507e6..6b1fdc1b46e7 100644 --- a/drivers/net/dsa/sja1105/sja1105_vl.c +++ b/drivers/net/dsa/sja1105/sja1105_vl.c @@ -550,10 +550,18 @@ int sja1105_vl_delete(struct sja1105_private *priv, int port, kfree(rule); } + rc = sja1105_compose_gating_subschedule(priv, extack); + if (rc) + return rc; + rc = sja1105_init_virtual_links(priv, extack); if (rc) return rc; + rc = sja1105_init_scheduling(priv); + if (rc < 0) + return rc; + return sja1105_static_config_reload(priv, SJA1105_VIRTUAL_LINKS); } -- cgit v1.2.3 From 43ce887c5050a3c213450a3058505f6a06519dd4 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 24 Jun 2020 16:54:47 +0300 Subject: net: dsa: sja1105: fix tc-gate schedule with single element The sja1105_gating_cfg_time_to_interval function does this, as per the comments: /* The gate entries contain absolute times in their e->interval field. Convert * that to proper intervals (i.e. "0, 5, 10, 15" to "5, 5, 5, 5"). */ To perform that task, it iterates over gating_cfg->entries, at each step updating the interval of the _previous_ entry. So one interval remains to be updated at the end of the loop: the last one (since it isn't "prev" for anyone else). But there was an erroneous check, that the last element's interval should not be updated if it's also the only element. I'm not quite sure why that check was there, but it's clearly incorrect, as a tc-gate schedule with a single element would get an e->interval of zero, regardless of the duration requested by the user. The switch wouldn't even consider this configuration as valid: it will just drop all traffic that matches the rule. Fixes: 834f8933d5dd ("net: dsa: sja1105: implement tc-gate using time-triggered virtual links") Reported-by: Xiaoliang Yang Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_vl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_vl.c b/drivers/net/dsa/sja1105/sja1105_vl.c index 6b1fdc1b46e7..af3565160db6 100644 --- a/drivers/net/dsa/sja1105/sja1105_vl.c +++ b/drivers/net/dsa/sja1105/sja1105_vl.c @@ -75,8 +75,7 @@ sja1105_gating_cfg_time_to_interval(struct sja1105_gating_config *gating_cfg, } last_e = list_last_entry(&gating_cfg->entries, struct sja1105_gate_entry, list); - if (last_e->list.prev != &gating_cfg->entries) - last_e->interval = cycle_time - last_e->interval; + last_e->interval = cycle_time - last_e->interval; } static void sja1105_free_gating_config(struct sja1105_gating_config *gating_cfg) -- cgit v1.2.3 From b344579ca8478598937215f7005d6c7b84d28aee Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Wed, 24 Jun 2020 12:42:02 -0400 Subject: tcp_cubic: fix spurious HYSTART_DELAY exit upon drop in min RTT Mirja Kuehlewind reported a bug in Linux TCP CUBIC Hystart, where Hystart HYSTART_DELAY mechanism can exit Slow Start spuriously on an ACK when the minimum rtt of a connection goes down. From inspection it is clear from the existing code that this could happen in an example like the following: o The first 8 RTT samples in a round trip are 150ms, resulting in a curr_rtt of 150ms and a delay_min of 150ms. o The 9th RTT sample is 100ms. The curr_rtt does not change after the first 8 samples, so curr_rtt remains 150ms. But delay_min can be lowered at any time, so delay_min falls to 100ms. The code executes the HYSTART_DELAY comparison between curr_rtt of 150ms and delay_min of 100ms, and the curr_rtt is declared far enough above delay_min to force a (spurious) exit of Slow start. The fix here is simple: allow every RTT sample in a round trip to lower the curr_rtt. Fixes: ae27e98a5152 ("[TCP] CUBIC v2.3") Reported-by: Mirja Kuehlewind Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 8f8eefd3a3ce..c7bf5b26bf0c 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -432,10 +432,9 @@ static void hystart_update(struct sock *sk, u32 delay) if (hystart_detect & HYSTART_DELAY) { /* obtain the minimum delay of more than sampling packets */ + if (ca->curr_rtt > delay) + ca->curr_rtt = delay; if (ca->sample_cnt < HYSTART_MIN_SAMPLES) { - if (ca->curr_rtt > delay) - ca->curr_rtt = delay; - ca->sample_cnt++; } else { if (ca->curr_rtt > ca->delay_min + -- cgit v1.2.3 From 7d21d54d624777358ab6c7be7ff778808fef70ba Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Wed, 24 Jun 2020 12:42:03 -0400 Subject: bpf: tcp: bpf_cubic: fix spurious HYSTART_DELAY exit upon drop in min RTT Apply the fix from: "tcp_cubic: fix spurious HYSTART_DELAY exit upon drop in min RTT" to the BPF implementation of TCP CUBIC congestion control. Repeating the commit description here for completeness: Mirja Kuehlewind reported a bug in Linux TCP CUBIC Hystart, where Hystart HYSTART_DELAY mechanism can exit Slow Start spuriously on an ACK when the minimum rtt of a connection goes down. From inspection it is clear from the existing code that this could happen in an example like the following: o The first 8 RTT samples in a round trip are 150ms, resulting in a curr_rtt of 150ms and a delay_min of 150ms. o The 9th RTT sample is 100ms. The curr_rtt does not change after the first 8 samples, so curr_rtt remains 150ms. But delay_min can be lowered at any time, so delay_min falls to 100ms. The code executes the HYSTART_DELAY comparison between curr_rtt of 150ms and delay_min of 100ms, and the curr_rtt is declared far enough above delay_min to force a (spurious) exit of Slow start. The fix here is simple: allow every RTT sample in a round trip to lower the curr_rtt. Fixes: 6de4a9c430b5 ("bpf: tcp: Add bpf_cubic example") Reported-by: Mirja Kuehlewind Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- tools/testing/selftests/bpf/progs/bpf_cubic.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c index 7897c8f4d363..ef574087f1e1 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c @@ -480,10 +480,9 @@ static __always_inline void hystart_update(struct sock *sk, __u32 delay) if (hystart_detect & HYSTART_DELAY) { /* obtain the minimum delay of more than sampling packets */ + if (ca->curr_rtt > delay) + ca->curr_rtt = delay; if (ca->sample_cnt < HYSTART_MIN_SAMPLES) { - if (ca->curr_rtt > delay) - ca->curr_rtt = delay; - ca->sample_cnt++; } else { if (ca->curr_rtt > ca->delay_min + -- cgit v1.2.3 From b6186d413b1d50f45fe2415d760947576ef3691a Mon Sep 17 00:00:00 2001 From: Briana Oursler Date: Wed, 24 Jun 2020 12:29:14 -0700 Subject: tc-testing: avoid action cookies with odd length. Update odd length cookie hexstrings in csum.json, tunnel_key.json and bpf.json to be even length to comply with check enforced in commit 0149dabf2a1b ("tc: m_actions: check cookie hexstring len") in iproute2. Signed-off-by: Briana Oursler Reviewed-by: Stefano Brivio Reviewed-by: Davide Caratti Signed-off-by: David S. Miller --- tools/testing/selftests/tc-testing/tc-tests/actions/bpf.json | 4 ++-- tools/testing/selftests/tc-testing/tc-tests/actions/csum.json | 4 ++-- tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/bpf.json b/tools/testing/selftests/tc-testing/tc-tests/actions/bpf.json index 47a3082b6661..503982b8f295 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/bpf.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/bpf.json @@ -260,10 +260,10 @@ 255 ] ], - "cmdUnderTest": "$TC action add action bpf bytecode '4,40 0 0 12,21 0 1 2054,6 0 0 262144,6 0 0 0' index 4294967296 cookie 12345", + "cmdUnderTest": "$TC action add action bpf bytecode '4,40 0 0 12,21 0 1 2054,6 0 0 262144,6 0 0 0' index 4294967296 cookie 123456", "expExitCode": "255", "verifyCmd": "$TC action ls action bpf", - "matchPattern": "action order [0-9]*: bpf bytecode '4,40 0 0 12,21 0 1 2048,6 0 0 262144,6 0 0 0' default-action pipe.*cookie 12345", + "matchPattern": "action order [0-9]*: bpf bytecode '4,40 0 0 12,21 0 1 2048,6 0 0 262144,6 0 0 0' default-action pipe.*cookie 123456", "matchCount": "0", "teardown": [ "$TC action flush action bpf" diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/csum.json b/tools/testing/selftests/tc-testing/tc-tests/actions/csum.json index 88ec134872e4..072febf25f55 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/csum.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/csum.json @@ -469,7 +469,7 @@ 255 ] ], - "cmdUnderTest": "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action csum tcp continue index \\$i cookie aaabbbcccdddeee \\\"; args=\"\\$args\\$cmd\"; done && $TC actions add \\$args\"", + "cmdUnderTest": "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action csum tcp continue index \\$i cookie 123456789abcde \\\"; args=\"\\$args\\$cmd\"; done && $TC actions add \\$args\"", "expExitCode": "0", "verifyCmd": "$TC actions ls action csum", "matchPattern": "^[ \t]+index [0-9]* ref", @@ -492,7 +492,7 @@ 1, 255 ], - "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action csum tcp continue index \\$i cookie aaabbbcccdddeee \\\"; args=\"\\$args\\$cmd\"; done && $TC actions add \\$args\"" + "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action csum tcp continue index \\$i cookie 123456789abcde \\\"; args=\"\\$args\\$cmd\"; done && $TC actions add \\$args\"" ], "cmdUnderTest": "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action csum index \\$i \\\"; args=\"\\$args\\$cmd\"; done && $TC actions del \\$args\"", "expExitCode": "0", diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json b/tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json index 7357c58fa2dc..d06346968bcb 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json @@ -818,12 +818,12 @@ 1, 255 ], - "$TC actions add action tunnel_key set src_ip 10.10.10.1 dst_ip 20.20.20.2 dst_port 3128 nocsum id 1 index 1 cookie aabbccddeeff112233445566778800a" + "$TC actions add action tunnel_key set src_ip 10.10.10.1 dst_ip 20.20.20.2 dst_port 3128 nocsum id 1 index 1 cookie 123456" ], - "cmdUnderTest": "$TC actions replace action tunnel_key set src_ip 11.11.11.1 dst_ip 21.21.21.2 dst_port 3129 id 11 csum reclassify index 1 cookie a1b1c1d1", + "cmdUnderTest": "$TC actions replace action tunnel_key set src_ip 11.11.11.1 dst_ip 21.21.21.2 dst_port 3129 id 11 csum reclassify index 1 cookie 123456", "expExitCode": "0", "verifyCmd": "$TC actions get action tunnel_key index 1", - "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 11.11.11.1.*dst_ip 21.21.21.2.*key_id 11.*dst_port 3129.*csum reclassify.*index 1.*cookie a1b1c1d1", + "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 11.11.11.1.*dst_ip 21.21.21.2.*key_id 11.*dst_port 3129.*csum reclassify.*index 1.*cookie 123456", "matchCount": "1", "teardown": [ "$TC actions flush action tunnel_key" -- cgit v1.2.3 From 471e39df96b9a4c4ba88a2da9e25a126624d7a9c Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 24 Jun 2020 17:34:18 -0300 Subject: sctp: Don't advertise IPv4 addresses if ipv6only is set on the socket If a socket is set ipv6only, it will still send IPv4 addresses in the INIT and INIT_ACK packets. This potentially misleads the peer into using them, which then would cause association termination. The fix is to not add IPv4 addresses to ipv6only sockets. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Corey Minyard Signed-off-by: Marcelo Ricardo Leitner Tested-by: Corey Minyard Signed-off-by: David S. Miller --- include/net/sctp/constants.h | 8 +++++--- net/sctp/associola.c | 5 ++++- net/sctp/bind_addr.c | 1 + net/sctp/protocol.c | 3 ++- 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 15b4d9aec7ff..122d9e2d8dfd 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -353,11 +353,13 @@ enum { ipv4_is_anycast_6to4(a)) /* Flags used for the bind address copy functions. */ -#define SCTP_ADDR6_ALLOWED 0x00000001 /* IPv6 address is allowed by +#define SCTP_ADDR4_ALLOWED 0x00000001 /* IPv4 address is allowed by local sock family */ -#define SCTP_ADDR4_PEERSUPP 0x00000002 /* IPv4 address is supported by +#define SCTP_ADDR6_ALLOWED 0x00000002 /* IPv6 address is allowed by + local sock family */ +#define SCTP_ADDR4_PEERSUPP 0x00000004 /* IPv4 address is supported by peer */ -#define SCTP_ADDR6_PEERSUPP 0x00000004 /* IPv6 address is supported by +#define SCTP_ADDR6_PEERSUPP 0x00000008 /* IPv6 address is supported by peer */ /* Reasons to retransmit. */ diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 72315137d7e7..8d735461fa19 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1565,12 +1565,15 @@ void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned int len) int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, enum sctp_scope scope, gfp_t gfp) { + struct sock *sk = asoc->base.sk; int flags; /* Use scoping rules to determine the subset of addresses from * the endpoint. */ - flags = (PF_INET6 == asoc->base.sk->sk_family) ? SCTP_ADDR6_ALLOWED : 0; + flags = (PF_INET6 == sk->sk_family) ? SCTP_ADDR6_ALLOWED : 0; + if (!inet_v6_ipv6only(sk)) + flags |= SCTP_ADDR4_ALLOWED; if (asoc->peer.ipv4_address) flags |= SCTP_ADDR4_PEERSUPP; if (asoc->peer.ipv6_address) diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c index 53bc61537f44..701c5a4e441d 100644 --- a/net/sctp/bind_addr.c +++ b/net/sctp/bind_addr.c @@ -461,6 +461,7 @@ static int sctp_copy_one_addr(struct net *net, struct sctp_bind_addr *dest, * well as the remote peer. */ if ((((AF_INET == addr->sa.sa_family) && + (flags & SCTP_ADDR4_ALLOWED) && (flags & SCTP_ADDR4_PEERSUPP))) || (((AF_INET6 == addr->sa.sa_family) && (flags & SCTP_ADDR6_ALLOWED) && diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 092d1afdee0d..cde29f3c7fb3 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -148,7 +148,8 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp, * sock as well as the remote peer. */ if (addr->a.sa.sa_family == AF_INET && - !(copy_flags & SCTP_ADDR4_PEERSUPP)) + (!(copy_flags & SCTP_ADDR4_ALLOWED) || + !(copy_flags & SCTP_ADDR4_PEERSUPP))) continue; if (addr->a.sa.sa_family == AF_INET6 && (!(copy_flags & SCTP_ADDR6_ALLOWED) || -- cgit v1.2.3 From b18e9834f7b248b41e8179a039ec80803bb3d67a Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Wed, 24 Jun 2020 14:02:36 -0700 Subject: vxlan: fix last fdb index during dump of fdb with nhid This patch fixes last saved fdb index in fdb dump handler when handling fdb's with nhid. Fixes: 1274e1cc4226 ("vxlan: ecmp support for mac fdb entries") Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index e8085ab6d484..89d85dcb200e 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1380,6 +1380,8 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct vxlan_rdst *rd; if (rcu_access_pointer(f->nh)) { + if (*idx < cb->args[2]) + goto skip_nh; err = vxlan_fdb_info(skb, vxlan, f, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, @@ -1387,6 +1389,8 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, NLM_F_MULTI, NULL); if (err < 0) goto out; +skip_nh: + *idx += 1; continue; } -- cgit v1.2.3 From df08126e3833e9dca19e2407db5f5860a7c194fb Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 24 Jun 2020 16:06:03 -0600 Subject: wireguard: receive: account for napi_gro_receive never returning GRO_DROP The napi_gro_receive function no longer returns GRO_DROP ever, making handling GRO_DROP dead code. This commit removes that dead code. Further, it's not even clear that device drivers have any business in taking action after passing off received packets; that's arguably out of their hands. Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Fixes: 6570bc79c0df ("net: core: use listified Rx for GRO_NORMAL in napi_gro_receive()") Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- drivers/net/wireguard/receive.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/net/wireguard/receive.c b/drivers/net/wireguard/receive.c index 91438144e4f7..9b2ab6fc91cd 100644 --- a/drivers/net/wireguard/receive.c +++ b/drivers/net/wireguard/receive.c @@ -414,14 +414,8 @@ static void wg_packet_consume_data_done(struct wg_peer *peer, if (unlikely(routed_peer != peer)) goto dishonest_packet_peer; - if (unlikely(napi_gro_receive(&peer->napi, skb) == GRO_DROP)) { - ++dev->stats.rx_dropped; - net_dbg_ratelimited("%s: Failed to give packet to userspace from peer %llu (%pISpfsc)\n", - dev->name, peer->internal_id, - &peer->endpoint.addr); - } else { - update_rx_stats(peer, message_data_len(len_before_trim)); - } + napi_gro_receive(&peer->napi, skb); + update_rx_stats(peer, message_data_len(len_before_trim)); return; dishonest_packet_peer: -- cgit v1.2.3 From e5e7d8052f6140985c03bd49ebaa0af9c2944bc6 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 24 Jun 2020 16:06:04 -0600 Subject: socionext: account for napi_gro_receive never returning GRO_DROP The napi_gro_receive function no longer returns GRO_DROP ever, making handling GRO_DROP dead code. This commit removes that dead code. Further, it's not even clear that device drivers have any business in taking action after passing off received packets; that's arguably out of their hands. Fixes: 6570bc79c0df ("net: core: use listified Rx for GRO_NORMAL in napi_gro_receive()") Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- drivers/net/ethernet/socionext/netsec.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index 328bc38848bb..0f366cc50b74 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -1044,8 +1044,9 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget) skb->ip_summed = CHECKSUM_UNNECESSARY; next: - if ((skb && napi_gro_receive(&priv->napi, skb) != GRO_DROP) || - xdp_result) { + if (skb) + napi_gro_receive(&priv->napi, skb); + if (skb || xdp_result) { ndev->stats.rx_packets++; ndev->stats.rx_bytes += xdp.data_end - xdp.data; } -- cgit v1.2.3 From 93ab48a97af50c7c62c3c94996002949ffebf004 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 24 Jun 2020 16:06:05 -0600 Subject: hns: do not cast return value of napi_gro_receive to null Basically no drivers care about the return value here, and there's no __must_check that would make casting to void sensible, so remove it. Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns/hns_enet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c index c117074c16e3..23f278e46975 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c @@ -699,7 +699,7 @@ static void hns_nic_rx_up_pro(struct hns_nic_ring_data *ring_data, struct net_device *ndev = ring_data->napi.dev; skb->protocol = eth_type_trans(skb, ndev); - (void)napi_gro_receive(&ring_data->napi, skb); + napi_gro_receive(&ring_data->napi, skb); } static int hns_desc_unused(struct hnae_ring *ring) -- cgit v1.2.3 From 045790b7bc66a75070c112a61558c639cef2263e Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 24 Jun 2020 16:06:06 -0600 Subject: wil6210: account for napi_gro_receive never returning GRO_DROP The napi_gro_receive function no longer returns GRO_DROP ever, making handling GRO_DROP dead code. This commit removes that dead code. Further, it's not even clear that device drivers have any business in taking action after passing off received packets; that's arguably out of their hands. In this case, too, the non-gro path didn't bother checking the return value. Plus, this had some clunky debugging functions that duplicated code from elsewhere and was generally pretty messy. So, this commit cleans that all up too. Fixes: 6570bc79c0df ("net: core: use listified Rx for GRO_NORMAL in napi_gro_receive()") Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- drivers/net/wireless/ath/wil6210/txrx.c | 39 ++++++++++----------------------- 1 file changed, 11 insertions(+), 28 deletions(-) diff --git a/drivers/net/wireless/ath/wil6210/txrx.c b/drivers/net/wireless/ath/wil6210/txrx.c index bc8c15fb609d..080e5aa60bea 100644 --- a/drivers/net/wireless/ath/wil6210/txrx.c +++ b/drivers/net/wireless/ath/wil6210/txrx.c @@ -897,7 +897,6 @@ static void wil_rx_handle_eapol(struct wil6210_vif *vif, struct sk_buff *skb) void wil_netif_rx(struct sk_buff *skb, struct net_device *ndev, int cid, struct wil_net_stats *stats, bool gro) { - gro_result_t rc = GRO_NORMAL; struct wil6210_vif *vif = ndev_to_vif(ndev); struct wil6210_priv *wil = ndev_to_wil(ndev); struct wireless_dev *wdev = vif_to_wdev(vif); @@ -908,22 +907,16 @@ void wil_netif_rx(struct sk_buff *skb, struct net_device *ndev, int cid, */ int mcast = is_multicast_ether_addr(da); struct sk_buff *xmit_skb = NULL; - static const char * const gro_res_str[] = { - [GRO_MERGED] = "GRO_MERGED", - [GRO_MERGED_FREE] = "GRO_MERGED_FREE", - [GRO_HELD] = "GRO_HELD", - [GRO_NORMAL] = "GRO_NORMAL", - [GRO_DROP] = "GRO_DROP", - [GRO_CONSUMED] = "GRO_CONSUMED", - }; if (wdev->iftype == NL80211_IFTYPE_STATION) { sa = wil_skb_get_sa(skb); if (mcast && ether_addr_equal(sa, ndev->dev_addr)) { /* mcast packet looped back to us */ - rc = GRO_DROP; dev_kfree_skb(skb); - goto stats; + ndev->stats.rx_dropped++; + stats->rx_dropped++; + wil_dbg_txrx(wil, "Rx drop %d bytes\n", len); + return; } } else if (wdev->iftype == NL80211_IFTYPE_AP && !vif->ap_isolate) { if (mcast) { @@ -967,26 +960,16 @@ void wil_netif_rx(struct sk_buff *skb, struct net_device *ndev, int cid, wil_rx_handle_eapol(vif, skb); if (gro) - rc = napi_gro_receive(&wil->napi_rx, skb); + napi_gro_receive(&wil->napi_rx, skb); else netif_rx_ni(skb); - wil_dbg_txrx(wil, "Rx complete %d bytes => %s\n", - len, gro_res_str[rc]); - } -stats: - /* statistics. rc set to GRO_NORMAL for AP bridging */ - if (unlikely(rc == GRO_DROP)) { - ndev->stats.rx_dropped++; - stats->rx_dropped++; - wil_dbg_txrx(wil, "Rx drop %d bytes\n", len); - } else { - ndev->stats.rx_packets++; - stats->rx_packets++; - ndev->stats.rx_bytes += len; - stats->rx_bytes += len; - if (mcast) - ndev->stats.multicast++; } + ndev->stats.rx_packets++; + stats->rx_packets++; + ndev->stats.rx_bytes += len; + stats->rx_bytes += len; + if (mcast) + ndev->stats.multicast++; } void wil_netif_rx_any(struct sk_buff *skb, struct net_device *ndev) -- cgit v1.2.3 From 1ae71d997a672739b22572cbe72d61a0eed8c42c Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Thu, 25 Jun 2020 00:09:08 +0200 Subject: ethtool: fix error handling in linkstate_prepare_data() When getting SQI or maximum SQI value fails in linkstate_prepare_data(), we must not return without calling ethnl_ops_complete(dev) as that could result in imbalance between ethtool_ops ->begin() and ->complete() calls. Fixes: 806602191592 ("ethtool: provide UAPI for PHY Signal Quality Index (SQI)") Signed-off-by: Michal Kubecek Signed-off-by: David S. Miller --- net/ethtool/linkstate.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c index 7f47ba89054e..afe5ac8a0f00 100644 --- a/net/ethtool/linkstate.c +++ b/net/ethtool/linkstate.c @@ -78,19 +78,18 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, ret = linkstate_get_sqi(dev); if (ret < 0 && ret != -EOPNOTSUPP) - return ret; - + goto out; data->sqi = ret; ret = linkstate_get_sqi_max(dev); if (ret < 0 && ret != -EOPNOTSUPP) - return ret; - + goto out; data->sqi_max = ret; + ret = 0; +out: ethnl_ops_complete(dev); - - return 0; + return ret; } static int linkstate_reply_size(const struct ethnl_req_info *req_base, -- cgit v1.2.3 From 9208d2863ac689a563b92f2161d8d1e7127d0add Mon Sep 17 00:00:00 2001 From: Ilya Ponetayev Date: Thu, 25 Jun 2020 22:12:07 +0200 Subject: sch_cake: don't try to reallocate or unshare skb unconditionally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cake_handle_diffserv() tries to linearize mac and network header parts of skb and to make it writable unconditionally. In some cases it leads to full skb reallocation, which reduces throughput and increases CPU load. Some measurements of IPv4 forward + NAPT on MIPS router with 580 MHz single-core CPU was conducted. It appears that on kernel 4.9 skb_try_make_writable() reallocates skb, if skb was allocated in ethernet driver via so-called 'build skb' method from page cache (it was discovered by strange increase of kmalloc-2048 slab at first). Obtain DSCP value via read-only skb_header_pointer() call, and leave linearization only for DSCP bleaching or ECN CE setting. And, as an additional optimisation, skip diffserv parsing entirely if it is not needed by the current configuration. Fixes: c87b4ecdbe8d ("sch_cake: Make sure we can write the IP header before changing DSCP bits") Signed-off-by: Ilya Ponetayev [ fix a few style issues, reflow commit message ] Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/sched/sch_cake.c | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 60f8ae578819..cae006bef565 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1553,30 +1553,49 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash) { - int wlen = skb_network_offset(skb); + const int offset = skb_network_offset(skb); + u16 *buf, buf_; u8 dscp; switch (tc_skb_protocol(skb)) { case htons(ETH_P_IP): - wlen += sizeof(struct iphdr); - if (!pskb_may_pull(skb, wlen) || - skb_try_make_writable(skb, wlen)) + buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); + if (unlikely(!buf)) return 0; - dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2; - if (wash && dscp) + /* ToS is in the second byte of iphdr */ + dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2; + + if (wash && dscp) { + const int wlen = offset + sizeof(struct iphdr); + + if (!pskb_may_pull(skb, wlen) || + skb_try_make_writable(skb, wlen)) + return 0; + ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0); + } + return dscp; case htons(ETH_P_IPV6): - wlen += sizeof(struct ipv6hdr); - if (!pskb_may_pull(skb, wlen) || - skb_try_make_writable(skb, wlen)) + buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); + if (unlikely(!buf)) return 0; - dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2; - if (wash && dscp) + /* Traffic class is in the first and second bytes of ipv6hdr */ + dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2; + + if (wash && dscp) { + const int wlen = offset + sizeof(struct ipv6hdr); + + if (!pskb_may_pull(skb, wlen) || + skb_try_make_writable(skb, wlen)) + return 0; + ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0); + } + return dscp; case htons(ETH_P_ARP): -- cgit v1.2.3 From 8c95eca0bb8c4bd2231a0d581f1ad0d50c90488c Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 25 Jun 2020 22:12:08 +0200 Subject: sch_cake: don't call diffserv parsing code when it is not needed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As a further optimisation of the diffserv parsing codepath, we can skip it entirely if CAKE is configured to neither use diffserv-based classification, nor to zero out the diffserv bits. Fixes: c87b4ecdbe8d ("sch_cake: Make sure we can write the IP header before changing DSCP bits") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/sched/sch_cake.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index cae006bef565..094d6e652deb 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1551,7 +1551,7 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) return idx + (tin << 16); } -static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash) +static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash) { const int offset = skb_network_offset(skb); u16 *buf, buf_; @@ -1612,14 +1612,17 @@ static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, { struct cake_sched_data *q = qdisc_priv(sch); u32 tin, mark; + bool wash; u8 dscp; /* Tin selection: Default to diffserv-based selection, allow overriding - * using firewall marks or skb->priority. + * using firewall marks or skb->priority. Call DSCP parsing early if + * wash is enabled, otherwise defer to below to skip unneeded parsing. */ - dscp = cake_handle_diffserv(skb, - q->rate_flags & CAKE_FLAG_WASH); mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft; + wash = !!(q->rate_flags & CAKE_FLAG_WASH); + if (wash) + dscp = cake_handle_diffserv(skb, wash); if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT) tin = 0; @@ -1633,6 +1636,8 @@ static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, tin = q->tin_order[TC_H_MIN(skb->priority) - 1]; else { + if (!wash) + dscp = cake_handle_diffserv(skb, wash); tin = q->tin_index[dscp]; if (unlikely(tin >= q->tin_cnt)) -- cgit v1.2.3 From 3f608f0c41360b11b04c763f348b712f651c8bac Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 25 Jun 2020 22:12:09 +0200 Subject: sch_cake: fix a few style nits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I spotted a few nits when comparing the in-tree version of sch_cake with the out-of-tree one: A redundant error variable declaration shadowing an outer declaration, and an indentation alignment issue. Fix both of these. Fixes: 046f6fd5daef ("sched: Add Common Applications Kept Enhanced (cake) qdisc") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/sched/sch_cake.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 094d6e652deb..ca813697728e 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -2715,7 +2715,7 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt, qdisc_watchdog_init(&q->watchdog, sch); if (opt) { - int err = cake_change(sch, opt, extack); + err = cake_change(sch, opt, extack); if (err) return err; @@ -3032,7 +3032,7 @@ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, PUT_STAT_S32(BLUE_TIMER_US, ktime_to_us( ktime_sub(now, - flow->cvars.blue_timer))); + flow->cvars.blue_timer))); } if (flow->cvars.dropping) { PUT_STAT_S32(DROP_NEXT_US, -- cgit v1.2.3 From 4c342f778fe234e0c2a2601d87fec8ba42f0d2c6 Mon Sep 17 00:00:00 2001 From: Rao Shoaib Date: Thu, 25 Jun 2020 13:46:00 -0700 Subject: rds: transport module should be auto loaded when transport is set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This enhancement auto loads transport module when the transport is set via SO_RDS_TRANSPORT socket option. Reviewed-by: Ka-Cheong Poon Reviewed-by: Håkon Bugge Signed-off-by: Rao Shoaib Signed-off-by: Somasundaram Krishnasamy Signed-off-by: David S. Miller --- include/uapi/linux/rds.h | 4 +++- net/rds/transport.c | 26 +++++++++++++++++--------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h index cba368e55863..c21edb966c19 100644 --- a/include/uapi/linux/rds.h +++ b/include/uapi/linux/rds.h @@ -64,10 +64,12 @@ /* supported values for SO_RDS_TRANSPORT */ #define RDS_TRANS_IB 0 -#define RDS_TRANS_IWARP 1 +#define RDS_TRANS_GAP 1 #define RDS_TRANS_TCP 2 #define RDS_TRANS_COUNT 3 #define RDS_TRANS_NONE (~0) +/* don't use RDS_TRANS_IWARP - it is deprecated */ +#define RDS_TRANS_IWARP RDS_TRANS_GAP /* IOCTLS commands for SOL_RDS */ #define SIOCRDSSETTOS (SIOCPROTOPRIVATE) diff --git a/net/rds/transport.c b/net/rds/transport.c index 46f709a4b577..f8001ec80867 100644 --- a/net/rds/transport.c +++ b/net/rds/transport.c @@ -38,6 +38,12 @@ #include "rds.h" #include "loop.h" +static char * const rds_trans_modules[] = { + [RDS_TRANS_IB] = "rds_rdma", + [RDS_TRANS_GAP] = NULL, + [RDS_TRANS_TCP] = "rds_tcp", +}; + static struct rds_transport *transports[RDS_TRANS_COUNT]; static DECLARE_RWSEM(rds_trans_sem); @@ -110,18 +116,20 @@ struct rds_transport *rds_trans_get(int t_type) { struct rds_transport *ret = NULL; struct rds_transport *trans; - unsigned int i; down_read(&rds_trans_sem); - for (i = 0; i < RDS_TRANS_COUNT; i++) { - trans = transports[i]; - - if (trans && trans->t_type == t_type && - (!trans->t_owner || try_module_get(trans->t_owner))) { - ret = trans; - break; - } + trans = transports[t_type]; + if (!trans) { + up_read(&rds_trans_sem); + if (rds_trans_modules[t_type]) + request_module(rds_trans_modules[t_type]); + down_read(&rds_trans_sem); + trans = transports[t_type]; } + if (trans && trans->t_type == t_type && + (!trans->t_owner || try_module_get(trans->t_owner))) + ret = trans; + up_read(&rds_trans_sem); return ret; -- cgit v1.2.3