From 66b5f1c439843bcbab01cc7f3854ae2742f3d1e3 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Thu, 18 Jul 2019 23:30:03 -0700 Subject: net-ipv6-ndisc: add support for RFC7710 RA Captive Portal Identifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is trivial since we already have support for the entirely identical (from the kernel's point of view) RDNSS and DNSSL that also contain opaque data that needs to be passed down to userspace. As specified in RFC7710, Captive Portal option contains a URL. 8-bit identifier of the option type as assigned by the IANA is 37. This option should also be treated as userland. Hence, treat ND option 37 as userland (Captive Portal support) See: https://tools.ietf.org/html/rfc7710 https://www.iana.org/assignments/icmpv6-parameters/icmpv6-parameters.xhtml Fixes: e35f30c131a56 Signed-off-by: Maciej Żenczykowski Cc: Lorenzo Colitti Cc: Remin Nguyen Van Cc: Alexey I. Froloff Signed-off-by: David S. Miller --- net/ipv6/ndisc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv6') diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 083cc1c94cd3..53caf59c591e 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -196,6 +196,7 @@ static inline int ndisc_is_useropt(const struct net_device *dev, { return opt->nd_opt_type == ND_OPT_RDNSS || opt->nd_opt_type == ND_OPT_DNSSL || + opt->nd_opt_type == ND_OPT_CAPTIVE_PORTAL || ndisc_ops_is_useropt(dev, opt->nd_opt_type); } -- cgit v1.2.3 From 280b0b8e89ade4277147e598d5806de12bff5fbc Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Fri, 26 Jul 2019 12:16:09 -0700 Subject: ipv6: remove printk ipv6_find_hdr() prints a non-rate limited error message when it cannot find an ipv6 header at a specific offset. This could be used as a DoS, so just remove it. Signed-off-by: Jonathan Lemon Signed-off-by: David S. 
Miller --- net/ipv6/exthdrs_core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index b358f1a4dd08..da46c4284676 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -197,10 +197,8 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, struct ipv6hdr _ip6, *ip6; ip6 = skb_header_pointer(skb, *offset, sizeof(_ip6), &_ip6); - if (!ip6 || (ip6->version != 6)) { - printk(KERN_ERR "IPv6 header not found\n"); + if (!ip6 || (ip6->version != 6)) return -EBADMSG; - } start = *offset + sizeof(struct ipv6hdr); nexthdr = ip6->nexthdr; } -- cgit v1.2.3 From 9349d600fb6a1ca0aaeb515523e1bb5409483d76 Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Mon, 29 Jul 2019 09:59:14 -0700 Subject: tcp: add skb-less helpers to retrieve SYN cookie This patch allows generation of a SYN cookie before an SKB has been allocated, as is the case at XDP. Signed-off-by: Petar Penkov Reviewed-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- include/net/tcp.h | 10 +++++++ net/ipv4/tcp_input.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 15 +++++++++++ net/ipv6/tcp_ipv6.c | 15 +++++++++++ 4 files changed, 113 insertions(+) (limited to 'net/ipv6') diff --git a/include/net/tcp.h b/include/net/tcp.h index e5cf514ba118..fb7e153aecc5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -414,6 +414,16 @@ void tcp_parse_options(const struct net *net, const struct sk_buff *skb, int estab, struct tcp_fastopen_cookie *foc); const u8 *tcp_parse_md5sig_option(const struct tcphdr *th); +/* + * BPF SKB-less helpers + */ +u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, + struct tcphdr *th, u32 *cookie); +u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph, + struct tcphdr *th, u32 *cookie); +u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, + const struct tcp_request_sock_ops *af_ops, + struct sock *sk, struct 
tcphdr *th); /* * TCP v4 functions exported for the inet6 API */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8892df6de1d4..706cbb3b2986 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3782,6 +3782,49 @@ static void smc_parse_options(const struct tcphdr *th, #endif } +/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped + * value on success. + */ +static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) +{ + const unsigned char *ptr = (const unsigned char *)(th + 1); + int length = (th->doff * 4) - sizeof(struct tcphdr); + u16 mss = 0; + + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return mss; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + if (length < 2) + return mss; + opsize = *ptr++; + if (opsize < 2) /* "silly options" */ + return mss; + if (opsize > length) + return mss; /* fail on partial options */ + if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) { + u16 in_mss = get_unaligned_be16(ptr); + + if (in_mss) { + if (user_mss && user_mss < in_mss) + in_mss = user_mss; + mss = in_mss; + } + } + ptr += opsize - 2; + length -= opsize; + } + } + return mss; +} + /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when * the fast version below fails. @@ -6464,6 +6507,36 @@ static void tcp_reqsk_record_syn(const struct sock *sk, } } +/* If a SYN cookie is required and supported, returns a clamped MSS value to be + * used for SYN cookie generation. 
+ */ +u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, + const struct tcp_request_sock_ops *af_ops, + struct sock *sk, struct tcphdr *th) +{ + struct tcp_sock *tp = tcp_sk(sk); + u16 mss; + + if (sock_net(sk)->ipv4.sysctl_tcp_syncookies != 2 && + !inet_csk_reqsk_queue_is_full(sk)) + return 0; + + if (!tcp_syn_flood_action(sk, rsk_ops->slab_name)) + return 0; + + if (sk_acceptq_is_full(sk)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); + return 0; + } + + mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss); + if (!mss) + mss = af_ops->mss_clamp; + + return mss; +} +EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss); + int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d57641cb3477..10217393cda6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1515,6 +1515,21 @@ static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) return sk; } +u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, + struct tcphdr *th, u32 *cookie) +{ + u16 mss = 0; +#ifdef CONFIG_SYN_COOKIES + mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, + &tcp_request_sock_ipv4_ops, sk, th); + if (mss) { + *cookie = __cookie_v4_init_sequence(iph, th, &mss); + tcp_synq_overflow(sk); + } +#endif + return mss; +} + /* The socket must have it's spinlock held when we get * here, unless it is a TCP_LISTEN socket. 
* diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5da069e91cac..87f44d3250ee 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1063,6 +1063,21 @@ static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb) return sk; } +u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph, + struct tcphdr *th, u32 *cookie) +{ + u16 mss = 0; +#ifdef CONFIG_SYN_COOKIES + mss = tcp_get_syncookie_mss(&tcp6_request_sock_ops, + &tcp_request_sock_ipv6_ops, sk, th); + if (mss) { + *cookie = __cookie_v6_init_sequence(iph, th, &mss); + tcp_synq_overflow(sk); + } +#endif + return mss; +} + static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { if (skb->protocol == htons(ETH_P_IP)) -- cgit v1.2.3 From 8c0bb7873815bf8c3c4dfb24e8ebf4fefb4c35d2 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Wed, 10 Jul 2019 12:05:59 +0200 Subject: netfilter: synproxy: rename mss synproxy_options field After the introduction of the "mss_encode" field in the synproxy_options struct, the field "mss" is a little confusing. It has been renamed to "mss_option".
Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_synproxy.h | 2 +- net/ipv4/netfilter/ipt_SYNPROXY.c | 4 ++-- net/ipv6/netfilter/ip6t_SYNPROXY.c | 4 ++-- net/netfilter/nf_synproxy_core.c | 8 ++++---- net/netfilter/nft_synproxy.c | 4 ++-- 5 files changed, 11 insertions(+), 11 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/netfilter/nf_conntrack_synproxy.h b/include/net/netfilter/nf_conntrack_synproxy.h index 44513b93bd55..2f0171d24997 100644 --- a/include/net/netfilter/nf_conntrack_synproxy.h +++ b/include/net/netfilter/nf_conntrack_synproxy.h @@ -67,7 +67,7 @@ static inline struct synproxy_net *synproxy_pernet(struct net *net) struct synproxy_options { u8 options; u8 wscale; - u16 mss; + u16 mss_option; u16 mss_encode; u32 tsval; u32 tsecr; diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 0e70f3f65f6f..748dc3ce58d3 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -36,8 +36,8 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) opts.options |= XT_SYNPROXY_OPT_ECN; opts.options &= info->options; - opts.mss_encode = opts.mss; - opts.mss = info->mss; + opts.mss_encode = opts.mss_option; + opts.mss_option = info->mss; if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy_init_timestamp_cookie(info, &opts); else diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index 5cdb4a69d277..fd1f52a21bf1 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -36,8 +36,8 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) opts.options |= XT_SYNPROXY_OPT_ECN; opts.options &= info->options; - opts.mss_encode = opts.mss; - opts.mss = info->mss; + opts.mss_encode = opts.mss_option; + opts.mss_option = info->mss; if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy_init_timestamp_cookie(info, &opts); else diff --git 
a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index c769462a839e..b0930d4aba22 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -56,7 +56,7 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, switch (opcode) { case TCPOPT_MSS: if (opsize == TCPOLEN_MSS) { - opts->mss = get_unaligned_be16(ptr); + opts->mss_option = get_unaligned_be16(ptr); opts->options |= NF_SYNPROXY_OPT_MSS; } break; @@ -115,7 +115,7 @@ synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts) if (options & NF_SYNPROXY_OPT_MSS) *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | - opts->mss); + opts->mss_option); if (options & NF_SYNPROXY_OPT_TIMESTAMP) { if (options & NF_SYNPROXY_OPT_SACK_PERM) @@ -642,7 +642,7 @@ synproxy_recv_client_ack(struct net *net, } this_cpu_inc(snet->stats->cookie_valid); - opts->mss = mss; + opts->mss_option = mss; opts->options |= NF_SYNPROXY_OPT_MSS; if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) @@ -1060,7 +1060,7 @@ synproxy_recv_client_ack_ipv6(struct net *net, } this_cpu_inc(snet->stats->cookie_valid); - opts->mss = mss; + opts->mss_option = mss; opts->options |= NF_SYNPROXY_OPT_MSS; if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c index 928e661d1517..db4c23f5dfcb 100644 --- a/net/netfilter/nft_synproxy.c +++ b/net/netfilter/nft_synproxy.c @@ -31,8 +31,8 @@ static void nft_synproxy_tcp_options(struct synproxy_options *opts, opts->options |= NF_SYNPROXY_OPT_ECN; opts->options &= priv->info.options; - opts->mss_encode = opts->mss; - opts->mss = info->mss; + opts->mss_encode = opts->mss_option; + opts->mss_option = info->mss; if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) synproxy_init_timestamp_cookie(info, opts); else -- cgit v1.2.3 From 43a4b60d04362185cd5475fd77a02bf6c56c07e4 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 1 Aug 2019 15:18:08 -0700 Subject: ipv6: have a single 
rcu unlock point in __ip6_rt_update_pmtu Simplify the unlock path in __ip6_rt_update_pmtu by using a single point where rcu_read_unlock is called. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e49fec767a10..3c5c331b50f1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2725,10 +2725,9 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, rcu_read_lock(); res.f6i = rcu_dereference(rt6->from); - if (!res.f6i) { - rcu_read_unlock(); - return; - } + if (!res.f6i) + goto out_unlock; + res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; @@ -2744,10 +2743,8 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, /* fib6_info uses a nexthop that does not have fib6_nh * using the dst->dev + gw. Should be impossible. */ - if (!arg.match) { - rcu_read_unlock(); - return; - } + if (!arg.match) + goto out_unlock; res.nh = arg.match; } else { @@ -2760,6 +2757,7 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (rt6_insert_exception(nrt6, &res)) dst_release_immediate(&nrt6->dst); } +out_unlock: rcu_read_unlock(); } } -- cgit v1.2.3 From c7a42eb49212f93a800560662d17d5293960d3c3 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 23 Aug 2019 19:33:03 +0800 Subject: net: ipv6: fix listify ip6_rcv_finish in case of forwarding We need a similar fix for ipv6 as Commit 0761680d5215 ("net: ipv4: fix listify ip_rcv_finish in case of forwarding") does for ipv4. This issue can be reproduced by syzbot since Commit 323ebb61e32b ("net: use listified RX for handling GRO_NORMAL skbs") on net-next. The call trace was: kernel BUG at include/linux/skbuff.h:2225!
RIP: 0010:__skb_pull include/linux/skbuff.h:2225 [inline] RIP: 0010:skb_pull+0xea/0x110 net/core/skbuff.c:1902 Call Trace: sctp_inq_pop+0x2f1/0xd80 net/sctp/inqueue.c:202 sctp_endpoint_bh_rcv+0x184/0x8d0 net/sctp/endpointola.c:385 sctp_inq_push+0x1e4/0x280 net/sctp/inqueue.c:80 sctp_rcv+0x2807/0x3590 net/sctp/input.c:256 sctp6_rcv+0x17/0x30 net/sctp/ipv6.c:1049 ip6_protocol_deliver_rcu+0x2fe/0x1660 net/ipv6/ip6_input.c:397 ip6_input_finish+0x84/0x170 net/ipv6/ip6_input.c:438 NF_HOOK include/linux/netfilter.h:305 [inline] NF_HOOK include/linux/netfilter.h:299 [inline] ip6_input+0xe4/0x3f0 net/ipv6/ip6_input.c:447 dst_input include/net/dst.h:442 [inline] ip6_sublist_rcv_finish+0x98/0x1e0 net/ipv6/ip6_input.c:84 ip6_list_rcv_finish net/ipv6/ip6_input.c:118 [inline] ip6_sublist_rcv+0x80c/0xcf0 net/ipv6/ip6_input.c:282 ipv6_list_rcv+0x373/0x4b0 net/ipv6/ip6_input.c:316 __netif_receive_skb_list_ptype net/core/dev.c:5049 [inline] __netif_receive_skb_list_core+0x5fc/0x9d0 net/core/dev.c:5097 __netif_receive_skb_list net/core/dev.c:5149 [inline] netif_receive_skb_list_internal+0x7eb/0xe60 net/core/dev.c:5244 gro_normal_list.part.0+0x1e/0xb0 net/core/dev.c:5757 gro_normal_list net/core/dev.c:5755 [inline] gro_normal_one net/core/dev.c:5769 [inline] napi_frags_finish net/core/dev.c:5782 [inline] napi_gro_frags+0xa6a/0xea0 net/core/dev.c:5855 tun_get_user+0x2e98/0x3fa0 drivers/net/tun.c:1974 tun_chr_write_iter+0xbd/0x156 drivers/net/tun.c:2020 Fixes: d8269e2cbf90 ("net: ipv6: listify ipv6_rcv() and ip6_rcv_finish()") Fixes: 323ebb61e32b ("net: use listified RX for handling GRO_NORMAL skbs") Reported-by: syzbot+eb349eeee854e389c36d@syzkaller.appspotmail.com Reported-by: syzbot+4a0643a653ac375612d1@syzkaller.appspotmail.com Signed-off-by: Xin Long Acked-by: Edward Cree Signed-off-by: David S. 
Miller --- net/ipv6/ip6_input.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index fa014d5f1732..d432d0011c16 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -80,8 +80,10 @@ static void ip6_sublist_rcv_finish(struct list_head *head) { struct sk_buff *skb, *next; - list_for_each_entry_safe(skb, next, head, list) + list_for_each_entry_safe(skb, next, head, list) { + skb_list_del_init(skb); dst_input(skb); + } } static void ip6_list_rcv_finish(struct net *net, struct sock *sk, -- cgit v1.2.3 From 0079ad8e8dc3a4d1af0dd4a53345580a6947beba Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 6 Sep 2019 15:36:01 +0800 Subject: ipmr: remove hard code cache_resolve_queue_len limit This is a re-post of a previous patch written by David Miller[1]. Phil Karn reported[2] that on busy networks with lots of unresolved multicast routing entries, the creation of new multicast group routes can be extremely slow and unreliable. The reason is that we hard-coded the limit on multicast route entries with unresolved source addresses (cache_resolve_queue_len) to 10. If some multicast route never resolves and the number of unresolved source addresses increases, there will be no ability to create new multicast route cache entries. To resolve this issue, we need to either add a sysctl entry to make the cache_resolve_queue_len configurable, or just remove the cache_resolve_queue_len limit directly, as we already have the socket receive queue limits of the mrouted socket, as pointed out by David. From my side, I'd prefer to remove the cache_resolve_queue_len limit instead of creating two more (IPv4 and IPv6 versions) sysctl entries. [1] https://lkml.org/lkml/2018/7/22/11 [2] https://lkml.org/lkml/2018/7/21/343 v3: instead of removing cache_resolve_queue_len totally, let's only remove the hard-coded limit when allocating the unresolved cache, as Eric Dumazet suggested, so we don't need to re-count it in other places.
v2: hold the mfc_unres_lock while walking the unresolved list in queue_count(), as Nikolay Aleksandrov remind. Reported-by: Phil Karn Signed-off-by: Hangbin Liu Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 4 ++-- net/ipv6/ip6mr.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index c07bc82cbbe9..313470f6bb14 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1134,8 +1134,8 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, if (!found) { /* Create a new entry if allowable */ - if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 || - (c = ipmr_cache_alloc_unres()) == NULL) { + c = ipmr_cache_alloc_unres(); + if (!c) { spin_unlock_bh(&mfc_unres_lock); kfree_skb(skb); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index e80d36c5073d..857a89ad4d6c 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1148,8 +1148,8 @@ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, * Create a new entry if allowable */ - if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 || - (c = ip6mr_cache_alloc_unres()) == NULL) { + c = ip6mr_cache_alloc_unres(); + if (!c) { spin_unlock_bh(&mfc_unres_lock); kfree_skb(skb); -- cgit v1.2.3 From cbfd68913c5d260260e56bcef8b3c4a449751795 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Mon, 9 Sep 2019 22:44:06 +0200 Subject: ipv6: Don't use dst gateway directly in ip6_confirm_neigh() This is the equivalent of commit 2c6b55f45d53 ("ipv6: fix neighbour resolution with raw socket") for ip6_confirm_neigh(): we can send a packet with MSG_CONFIRM on a raw socket for a connected route, so the gateway would be :: here, and we should pick the next hop using rt6_nexthop() instead. 
This was found by code review and, to the best of my knowledge, doesn't actually fix a practical issue: the destination address from the packet is not considered while confirming a neighbour, as ip6_confirm_neigh() calls choose_neigh_daddr() without passing the packet, so there are no similar issues as the one fixed by said commit. A possible source of issues with the existing implementation might come from the fact that, if we have a cached dst, we won't consider it, while rt6_nexthop() takes care of that. I might just not be creative enough to find a practical problem here: the only way to affect this with cached routes is to have one coming from an ICMPv6 redirect, but if the next hop is a directly connected host, there should be no topology for which a redirect applies here, and tests with redirected routes show no differences for MSG_CONFIRM (and MSG_PROBE) packets on raw sockets destined to a directly connected host. However, directly using the dst gateway here is not consistent anymore with neighbour resolution, and, in general, as we want the next hop, using rt6_nexthop() looks like the only sane way to fetch it. Reported-by: Guillaume Nault Signed-off-by: Stefano Brivio Acked-by: Guillaume Nault Acked-by: Nicolas Dichtel Signed-off-by: David S. 
Miller --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 7a5d331cdefa..874641d4d2a1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -227,7 +227,7 @@ static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) struct net_device *dev = dst->dev; struct rt6_info *rt = (struct rt6_info *)dst; - daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr); + daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr); if (!daddr) return; if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) -- cgit v1.2.3 From 40d102cde0a2aabb5e542ab1ab1aa4aaa1fd4372 Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Fri, 13 Sep 2019 09:13:05 +0100 Subject: netfilter: update include directives. Include some headers in files which require them, and remove others which are not required. Signed-off-by: Jeremy Sowden Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_core.h | 3 ++- include/net/netfilter/nf_conntrack_zones.h | 3 ++- include/net/netfilter/nf_nat.h | 13 ++++++------- include/net/netfilter/nf_nat_masquerade.h | 1 + net/bridge/netfilter/nf_conntrack_bridge.c | 1 - net/ipv6/netfilter/nf_socket_ipv6.c | 1 - net/netfilter/nf_conntrack_ecache.c | 1 + net/netfilter/nf_conntrack_expect.c | 2 ++ net/netfilter/nf_conntrack_helper.c | 5 +++-- net/netfilter/nf_conntrack_timeout.c | 1 + net/netfilter/nf_flow_table_core.c | 1 + net/netfilter/nf_nat_core.c | 6 +++--- net/netfilter/nft_flow_offload.c | 3 ++- net/netfilter/xt_connlimit.c | 2 ++ net/sched/act_ct.c | 2 +- 15 files changed, 27 insertions(+), 18 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 71a2d9cb64ea..d340886e012d 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -14,8 +14,9 @@ #define _NF_CONNTRACK_CORE_H #include -#include 
+#include #include +#include /* This header is used to share core functionality between the standalone connection tracking module, and the compatibility layer's use diff --git a/include/net/netfilter/nf_conntrack_zones.h b/include/net/netfilter/nf_conntrack_zones.h index 52950baa3ab5..33b91d19cb7d 100644 --- a/include/net/netfilter/nf_conntrack_zones.h +++ b/include/net/netfilter/nf_conntrack_zones.h @@ -5,7 +5,8 @@ #include #if IS_ENABLED(CONFIG_NF_CONNTRACK) -#include + +#include static inline const struct nf_conntrack_zone * nf_ct_zone(const struct nf_conn *ct) diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index eec208fb9c23..eeb336809679 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -1,9 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_NAT_H #define _NF_NAT_H + +#include #include -#include +#include +#include +#include #include +#include enum nf_nat_manip_type { NF_NAT_MANIP_SRC, @@ -14,10 +19,6 @@ enum nf_nat_manip_type { #define HOOK2MANIP(hooknum) ((hooknum) != NF_INET_POST_ROUTING && \ (hooknum) != NF_INET_LOCAL_IN) -#include -#include -#include - /* per conntrack: nat application helper private data */ union nf_conntrack_nat_help { /* insert nat helper private data here */ @@ -26,8 +27,6 @@ union nf_conntrack_nat_help { #endif }; -struct nf_conn; - /* The structure embedded in the conntrack structure. 
*/ struct nf_conn_nat { union nf_conntrack_nat_help help; diff --git a/include/net/netfilter/nf_nat_masquerade.h b/include/net/netfilter/nf_nat_masquerade.h index 54a14d643c34..be7abc9d5f22 100644 --- a/include/net/netfilter/nf_nat_masquerade.h +++ b/include/net/netfilter/nf_nat_masquerade.h @@ -2,6 +2,7 @@ #ifndef _NF_NAT_MASQUERADE_H_ #define _NF_NAT_MASQUERADE_H_ +#include #include unsigned int diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index 4f5444d2a526..c9ce321fcac1 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -17,7 +17,6 @@ #include #include -#include #include #include "../br_private.h" diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c index 437d95545c31..b9df879c48d3 100644 --- a/net/ipv6/netfilter/nf_socket_ipv6.c +++ b/net/ipv6/netfilter/nf_socket_ipv6.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 5e2812ee2149..6fba74b5aaf7 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -24,6 +24,7 @@ #include #include +#include #include static DEFINE_MUTEX(nf_ct_ecache_mutex); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 65364de915d1..42557d2b6a90 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -25,8 +25,10 @@ #include #include +#include #include #include +#include #include #include diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 8d729e7c36ff..118f415928ae 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -21,10 +21,11 @@ #include #include -#include -#include #include +#include #include +#include +#include #include static 
DEFINE_MUTEX(nf_ct_helper_mutex); diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c index 13d0f4a92647..14387e0b8008 100644 --- a/net/netfilter/nf_conntrack_timeout.c +++ b/net/netfilter/nf_conntrack_timeout.c @@ -19,6 +19,7 @@ #include #include #include +#include #include struct nf_ct_timeout * diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 80a8f9ae4c93..09310a1bd91f 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -11,6 +11,7 @@ #include #include #include +#include #include struct flow_offload_entry { diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 3f6023ed4966..bfc555fcbc72 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -18,12 +18,12 @@ #include #include -#include -#include #include #include #include -#include +#include +#include +#include #include "nf_internals.h" diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 01705ad74a9a..22cf236eb5d5 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -6,12 +6,13 @@ #include #include #include +#include #include #include /* for ipv4 options. 
*/ #include #include #include -#include +#include #include struct nft_flow_offload { diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index bc6c8ab0fa62..46fcac75f726 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -13,6 +13,8 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include #include #include #include diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index cdd6f3818097..fcc46025e790 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -24,12 +24,12 @@ #include #include -#include #include #include #include #include #include +#include static struct tc_action_ops act_ct_ops; static unsigned int ct_net_id; -- cgit v1.2.3 From 44dde23698a7a8a807d974a5124cf64b7ab2c9d5 Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Fri, 13 Sep 2019 09:13:07 +0100 Subject: netfilter: move inline nf_ip6_ext_hdr() function to a more appropriate header. There is an inline function in ip6_tables.h which is not specific to ip6tables and is used elsewhere in netfilter. Move it into netfilter_ipv6.h and update the callers.
Signed-off-by: Jeremy Sowden Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ipv6.h | 12 ++++++++++++ include/linux/netfilter_ipv6/ip6_tables.h | 12 ------------ net/ipv6/netfilter/ip6t_ipv6header.c | 4 ++-- net/ipv6/netfilter/nf_log_ipv6.c | 4 ++-- 4 files changed, 16 insertions(+), 16 deletions(-) (limited to 'net/ipv6') diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index a889e376d197..c1500209cfaf 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -10,6 +10,18 @@ #include #include +/* Check for an extension */ +static inline int +nf_ip6_ext_hdr(u8 nexthdr) +{ return (nexthdr == IPPROTO_HOPOPTS) || + (nexthdr == IPPROTO_ROUTING) || + (nexthdr == IPPROTO_FRAGMENT) || + (nexthdr == IPPROTO_ESP) || + (nexthdr == IPPROTO_AH) || + (nexthdr == IPPROTO_NONE) || + (nexthdr == IPPROTO_DSTOPTS); +} + /* Extra routing may needed on local out, as the QUEUE target never returns * control to the table. */ diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h index 666450c117bf..3a0a2bd054cc 100644 --- a/include/linux/netfilter_ipv6/ip6_tables.h +++ b/include/linux/netfilter_ipv6/ip6_tables.h @@ -36,18 +36,6 @@ extern unsigned int ip6t_do_table(struct sk_buff *skb, struct xt_table *table); #endif -/* Check for an extension */ -static inline int -ip6t_ext_hdr(u8 nexthdr) -{ return (nexthdr == IPPROTO_HOPOPTS) || - (nexthdr == IPPROTO_ROUTING) || - (nexthdr == IPPROTO_FRAGMENT) || - (nexthdr == IPPROTO_ESP) || - (nexthdr == IPPROTO_AH) || - (nexthdr == IPPROTO_NONE) || - (nexthdr == IPPROTO_DSTOPTS); -} - #ifdef CONFIG_COMPAT #include diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c index 0fc6326ef499..c52ff929c93b 100644 --- a/net/ipv6/netfilter/ip6t_ipv6header.c +++ b/net/ipv6/netfilter/ip6t_ipv6header.c @@ -16,7 +16,7 @@ #include #include -#include +#include #include MODULE_LICENSE("GPL"); @@ -42,7 +42,7 @@ 
ipv6header_mt6(const struct sk_buff *skb, struct xt_action_param *par) len = skb->len - ptr; temp = 0; - while (ip6t_ext_hdr(nexthdr)) { + while (nf_ip6_ext_hdr(nexthdr)) { const struct ipv6_opt_hdr *hp; struct ipv6_opt_hdr _hdr; int hdrlen; diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c index f53bd8f01219..22b80db6d882 100644 --- a/net/ipv6/netfilter/nf_log_ipv6.c +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include @@ -70,7 +70,7 @@ static void dump_ipv6_packet(struct net *net, struct nf_log_buf *m, fragment = 0; ptr = ip6hoff + sizeof(struct ipv6hdr); currenthdr = ih->nexthdr; - while (currenthdr != NEXTHDR_NONE && ip6t_ext_hdr(currenthdr)) { + while (currenthdr != NEXTHDR_NONE && nf_ip6_ext_hdr(currenthdr)) { struct ipv6_opt_hdr _hdr; const struct ipv6_opt_hdr *hp; -- cgit v1.2.3 From 46705b070c279b352bbbe8118d78aa31b0768245 Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Fri, 13 Sep 2019 09:13:09 +0100 Subject: netfilter: move nf_bridge_frag_data struct definition to a more appropriate header. There is a struct definition in nf_conntrack_bridge.h which is not specific to conntrack and is used elsewhere in netfilter. Move it into netfilter_bridge.h.
Signed-off-by: Jeremy Sowden Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge.h | 7 +++++++ include/linux/netfilter_ipv6.h | 14 +++++++------- include/net/netfilter/nf_conntrack_bridge.h | 7 ------- net/bridge/netfilter/nf_conntrack_bridge.c | 14 +++++++------- net/ipv6/netfilter.c | 4 ++-- 5 files changed, 23 insertions(+), 23 deletions(-) (limited to 'net/ipv6') diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index 5f2614d02e03..f980edfdd278 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -5,6 +5,13 @@ #include #include +struct nf_bridge_frag_data { + char mac[ETH_HLEN]; + bool vlan_present; + u16 vlan_tci; + __be16 vlan_proto; +}; + #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb); diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index c1500209cfaf..aac42c28fe62 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -32,7 +32,7 @@ struct ip6_rt_info { }; struct nf_queue_entry; -struct nf_ct_bridge_frag_data; +struct nf_bridge_frag_data; /* * Hook functions for ipv6 to allow xt_* modules to be built-in even @@ -61,9 +61,9 @@ struct nf_ipv6_ops { int (*br_defrag)(struct net *net, struct sk_buff *skb, u32 user); int (*br_fragment)(struct net *net, struct sock *sk, struct sk_buff *skb, - struct nf_ct_bridge_frag_data *data, + struct nf_bridge_frag_data *data, int (*output)(struct net *, struct sock *sk, - const struct nf_ct_bridge_frag_data *data, + const struct nf_bridge_frag_data *data, struct sk_buff *)); #endif }; @@ -135,16 +135,16 @@ static inline int nf_ipv6_br_defrag(struct net *net, struct sk_buff *skb, } int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, - struct nf_ct_bridge_frag_data *data, + struct nf_bridge_frag_data *data, int (*output)(struct net *, struct sock *sk, - const struct nf_ct_bridge_frag_data 
*data, + const struct nf_bridge_frag_data *data, struct sk_buff *)); static inline int nf_br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, - struct nf_ct_bridge_frag_data *data, + struct nf_bridge_frag_data *data, int (*output)(struct net *, struct sock *sk, - const struct nf_ct_bridge_frag_data *data, + const struct nf_bridge_frag_data *data, struct sk_buff *)) { #if IS_MODULE(CONFIG_IPV6) diff --git a/include/net/netfilter/nf_conntrack_bridge.h b/include/net/netfilter/nf_conntrack_bridge.h index 34c28f248b18..01b62fd5efa2 100644 --- a/include/net/netfilter/nf_conntrack_bridge.h +++ b/include/net/netfilter/nf_conntrack_bridge.h @@ -16,11 +16,4 @@ struct nf_ct_bridge_info { void nf_ct_bridge_register(struct nf_ct_bridge_info *info); void nf_ct_bridge_unregister(struct nf_ct_bridge_info *info); -struct nf_ct_bridge_frag_data { - char mac[ETH_HLEN]; - bool vlan_present; - u16 vlan_tci; - __be16 vlan_proto; -}; - #endif diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index c9ce321fcac1..8842798c29e6 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -26,9 +26,9 @@ */ static int nf_br_ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, - struct nf_ct_bridge_frag_data *data, + struct nf_bridge_frag_data *data, int (*output)(struct net *, struct sock *sk, - const struct nf_ct_bridge_frag_data *data, + const struct nf_bridge_frag_data *data, struct sk_buff *)) { int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; @@ -278,7 +278,7 @@ static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb, } static void nf_ct_bridge_frag_save(struct sk_buff *skb, - struct nf_ct_bridge_frag_data *data) + struct nf_bridge_frag_data *data) { if (skb_vlan_tag_present(skb)) { data->vlan_present = true; @@ -293,10 +293,10 @@ static void nf_ct_bridge_frag_save(struct sk_buff *skb, static unsigned int nf_ct_bridge_refrag(struct sk_buff 
*skb, const struct nf_hook_state *state, int (*output)(struct net *, struct sock *sk, - const struct nf_ct_bridge_frag_data *data, + const struct nf_bridge_frag_data *data, struct sk_buff *)) { - struct nf_ct_bridge_frag_data data; + struct nf_bridge_frag_data data; if (!BR_INPUT_SKB_CB(skb)->frag_max_size) return NF_ACCEPT; @@ -319,7 +319,7 @@ nf_ct_bridge_refrag(struct sk_buff *skb, const struct nf_hook_state *state, /* Actually only slow path refragmentation needs this. */ static int nf_ct_bridge_frag_restore(struct sk_buff *skb, - const struct nf_ct_bridge_frag_data *data) + const struct nf_bridge_frag_data *data) { int err; @@ -340,7 +340,7 @@ static int nf_ct_bridge_frag_restore(struct sk_buff *skb, } static int nf_ct_bridge_refrag_post(struct net *net, struct sock *sk, - const struct nf_ct_bridge_frag_data *data, + const struct nf_bridge_frag_data *data, struct sk_buff *skb) { int err; diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 61819ed858b1..a9bff556d3b2 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -113,9 +113,9 @@ int __nf_ip6_route(struct net *net, struct dst_entry **dst, EXPORT_SYMBOL_GPL(__nf_ip6_route); int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, - struct nf_ct_bridge_frag_data *data, + struct nf_bridge_frag_data *data, int (*output)(struct net *, struct sock *sk, - const struct nf_ct_bridge_frag_data *data, + const struct nf_bridge_frag_data *data, struct sk_buff *)) { int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; -- cgit v1.2.3 From c6af0c227a22bb6bb8ff72f043e0fb6d99fd6515 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 11 Sep 2019 15:50:51 -0400 Subject: ip: support SO_MARK cmsg Enable setting skb->mark for UDP and RAW sockets using cmsg. This is analogous to existing support for TOS, TTL, txtime, etc. Packet sockets already support this as of commit c7d39e32632e ("packet: support per-packet fwmark for af_packet sendmsg"). 
Similar to other fields, implement by 1. initialize the sockcm_cookie.mark from socket option sk_mark 2. optionally overwrite this in ip_cmsg_send/ip6_datagram_send_ctl 3. initialize inet_cork.mark from sockcm_cookie.mark 4. initialize each (usually just one) skb->mark from inet_cork.mark Step 1 is handled in one location for most protocols by ipcm_init_sk as of commit 351782067b6b ("ipv4: ipcm_cookie initializers"). Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/inet_sock.h | 1 + include/net/ip.h | 1 + net/ipv4/ip_output.c | 3 ++- net/ipv4/ping.c | 2 +- net/ipv4/raw.c | 4 ++-- net/ipv4/udp.c | 2 +- net/ipv6/ip6_output.c | 3 ++- net/ipv6/raw.c | 4 +++- net/ipv6/udp.c | 3 ++- 9 files changed, 15 insertions(+), 8 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 7769c9b36d75..34c4436fd18f 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -160,6 +160,7 @@ struct inet_cork { char priority; __u16 gso_size; u64 transmit_time; + u32 mark; }; struct inet_cork_full { diff --git a/include/net/ip.h b/include/net/ip.h index 29d89de39822..95bb77f95bcc 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -88,6 +88,7 @@ static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, { ipcm_init(ipcm); + ipcm->sockc.mark = inet->sk.sk_mark; ipcm->sockc.tsflags = inet->sk.sk_tsflags; ipcm->oif = inet->sk.sk_bound_dev_if; ipcm->addr = inet->inet_saddr; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index cc7ef0d05bbd..5eb73775c3f7 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1266,6 +1266,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, cork->length = 0; cork->ttl = ipc->ttl; cork->tos = ipc->tos; + cork->mark = ipc->sockc.mark; cork->priority = ipc->priority; cork->transmit_time = ipc->sockc.transmit_time; cork->tx_flags = 0; @@ -1529,7 +1530,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, } skb->priority = (cork->tos != -1) 
? cork->priority: sk->sk_priority; - skb->mark = sk->sk_mark; + skb->mark = cork->mark; skb->tstamp = cork->transmit_time; /* * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 9d24ef5c5d8f..535427292194 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -781,7 +781,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } else if (!ipc.oif) ipc.oif = inet->uc_index; - flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, + flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, inet_sk_flowi_flags(sk), faddr, saddr, 0, 0, sk->sk_uid); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 40a6abbc9cf6..80da5a66d5d7 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -375,7 +375,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb_reserve(skb, hlen); skb->priority = sk->sk_priority; - skb->mark = sk->sk_mark; + skb->mark = sockc->mark; skb->tstamp = sockc->transmit_time; skb_dst_set(skb, &rt->dst); *rtp = NULL; @@ -623,7 +623,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } } - flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, + flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, RT_SCOPE_UNIVERSE, hdrincl ? 
IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk) | diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d88821c794fb..fbcd9be3a470 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1130,7 +1130,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl4 = &fl4_stack; - flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, + flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, flow_flags, faddr, saddr, dport, inet->inet_sport, diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 8e49fd62eea9..89a4c7c2e25d 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1294,6 +1294,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, cork->base.fragsize = mtu; cork->base.gso_size = ipc6->gso_size; cork->base.tx_flags = 0; + cork->base.mark = ipc6->sockc.mark; sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags); if (dst_allfrag(xfrm_dst_path(&rt->dst))) @@ -1764,7 +1765,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, hdr->daddr = *final_dst; skb->priority = sk->sk_priority; - skb->mark = sk->sk_mark; + skb->mark = cork->base.mark; skb->tstamp = cork->base.transmit_time; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 8a6131991e38..6e1888ee4036 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -646,7 +646,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb->protocol = htons(ETH_P_IPV6); skb->priority = sk->sk_priority; - skb->mark = sk->sk_mark; + skb->mark = sockc->mark; skb->tstamp = sockc->transmit_time; skb_put(skb, length); @@ -810,6 +810,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipcm6_init(&ipc6); ipc6.sockc.tsflags = sk->sk_tsflags; + ipc6.sockc.mark = sk->sk_mark; if (sin6) { if (addr_len < SIN6_LEN_RFC2133) @@ -891,6 +892,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) opt = ipv6_fixup_options(&opt_space, opt); fl6.flowi6_proto = proto; + 
fl6.flowi6_mark = ipc6.sockc.mark; if (!hdrincl) { rfv.msg = msg; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 827fe7385078..2c8beb3896d1 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1230,6 +1230,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipcm6_init(&ipc6); ipc6.gso_size = up->gso_size; ipc6.sockc.tsflags = sk->sk_tsflags; + ipc6.sockc.mark = sk->sk_mark; /* destination address check */ if (sin6) { @@ -1352,7 +1353,7 @@ do_udp_sendmsg: if (!fl6.flowi6_oif) fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; - fl6.flowi6_mark = sk->sk_mark; + fl6.flowi6_mark = ipc6.sockc.mark; fl6.flowi6_uid = sk->sk_uid; if (msg->msg_controllen) { -- cgit v1.2.3 From acdcecc61285faed359f1a3568c32089cc3a8329 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 12 Sep 2019 21:16:39 -0400 Subject: udp: correct reuseport selection with connected sockets UDP reuseport groups can hold a mix of unconnected and connected sockets. Ensure that connections only receive all traffic to their 4-tuple. Fast reuseport returns on the first reuseport match on the assumption that all matches are equal. Only if connections are present, return to the previous behavior of scoring all sockets. Record if connections are present and if so (1) treat such connected sockets as an independent match from the group, (2) only return 2-tuple matches from reuseport and (3) do not return on the first 2-tuple reuseport match to allow for a higher scoring match later. New field has_conns is set without locks. No other fields in the bitmap are modified at runtime and the field is only ever set unconditionally, so an RMW cannot miss a change. Fixes: e32ea7e74727 ("soreuseport: fast reuseport UDP socket selection") Link: http://lkml.kernel.org/r/CA+FuTSfRP09aJNYRt04SS6qj22ViiOEWaWmLAwX0psk8-PGNxw@mail.gmail.com Signed-off-by: Willem de Bruijn Acked-by: Paolo Abeni Acked-by: Craig Gallek Signed-off-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- include/net/sock_reuseport.h | 20 +++++++++++++++++++- net/core/sock_reuseport.c | 15 +++++++++++++-- net/ipv4/datagram.c | 2 ++ net/ipv4/udp.c | 5 +++-- net/ipv6/datagram.c | 2 ++ net/ipv6/udp.c | 5 +++-- 6 files changed, 42 insertions(+), 7 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index d9112de85261..43f4a818d88f 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -21,7 +21,8 @@ struct sock_reuseport { unsigned int synq_overflow_ts; /* ID stays the same even after the size of socks[] grows. */ unsigned int reuseport_id; - bool bind_inany; + unsigned int bind_inany:1; + unsigned int has_conns:1; struct bpf_prog __rcu *prog; /* optional BPF sock selector */ struct sock *socks[0]; /* array of sock pointers */ }; @@ -37,6 +38,23 @@ extern struct sock *reuseport_select_sock(struct sock *sk, extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog); extern int reuseport_detach_prog(struct sock *sk); +static inline bool reuseport_has_conns(struct sock *sk, bool set) +{ + struct sock_reuseport *reuse; + bool ret = false; + + rcu_read_lock(); + reuse = rcu_dereference(sk->sk_reuseport_cb); + if (reuse) { + if (set) + reuse->has_conns = 1; + ret = reuse->has_conns; + } + rcu_read_unlock(); + + return ret; +} + int reuseport_get_id(struct sock_reuseport *reuse); #endif /* _SOCK_REUSEPORT_H */ diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 9408f9264d05..f3ceec93f392 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -295,8 +295,19 @@ struct sock *reuseport_select_sock(struct sock *sk, select_by_hash: /* no bpf or invalid bpf result: fall back to hash usage */ - if (!sk2) - sk2 = reuse->socks[reciprocal_scale(hash, socks)]; + if (!sk2) { + int i, j; + + i = j = reciprocal_scale(hash, socks); + while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) { + i++; + if (i >= reuse->num_socks) + i = 0; + if (i == j) + 
goto out; + } + sk2 = reuse->socks[i]; + } } out: diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 7bd29e694603..9a0fe0c2fa02 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -15,6 +15,7 @@ #include #include #include +#include int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -69,6 +70,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len } inet->inet_daddr = fl4->daddr; inet->inet_dport = usin->sin_port; + reuseport_has_conns(sk, true); sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); inet->inet_id = jiffies; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d88821c794fb..16486c8b708b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -423,12 +423,13 @@ static struct sock *udp4_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { - if (sk->sk_reuseport) { + if (sk->sk_reuseport && + sk->sk_state != TCP_ESTABLISHED) { hash = udp_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); - if (result) + if (result && !reuseport_has_conns(sk, false)) return result; } badness = score; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 9ab897ded4df..96f939248d2f 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -254,6 +255,7 @@ ipv4_connected: goto out; } + reuseport_has_conns(sk, true); sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); out: diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 827fe7385078..5995fdc99d3f 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -158,13 +158,14 @@ static struct sock *udp6_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { - if (sk->sk_reuseport) { + if (sk->sk_reuseport && + sk->sk_state != TCP_ESTABLISHED) { hash = udp6_ehashfn(net, daddr, hnum, 
saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); - if (result) + if (result && !reuseport_has_conns(sk, false)) return result; } result = sk; -- cgit v1.2.3 From 28e486037747c2180470b77c290d4090ad42f259 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 13 Sep 2019 17:45:47 +0800 Subject: ip6_gre: fix a dst leak in ip6erspan_tunnel_xmit In ip6erspan_tunnel_xmit(), if the skb will not be sent out, it has to be freed on the tx_err path. Otherwise when deleting a netns, it would cause dst/dev to leak, and dmesg shows: unregister_netdevice: waiting for lo to become free. Usage count = 1 Fixes: ef7baf5e083c ("ip6_gre: add ip6 erspan collect_md mode") Signed-off-by: Xin Long Acked-by: William Tu Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index dd2d0b963260..d5779d6a6065 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -968,7 +968,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || ip_tunnel_info_af(tun_info) != AF_INET6)) - return -EINVAL; + goto tx_err; key = &tun_info->key; memset(&fl6, 0, sizeof(fl6)); -- cgit v1.2.3