diff options
Diffstat (limited to 'net/ipv4/inet_connection_sock.c')
-rw-r--r-- | net/ipv4/inet_connection_sock.c | 275 |
1 files changed, 200 insertions, 75 deletions
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 19ea045c50ed..096a085611ab 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -31,6 +31,86 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; EXPORT_SYMBOL(inet_csk_timer_bug_msg); #endif +#if IS_ENABLED(CONFIG_IPV6) +/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6 + * only, and any IPv4 addresses if not IPv6 only + * match_wildcard == false: addresses must be exactly the same, i.e. + * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, + * and 0.0.0.0 equals to 0.0.0.0 only + */ +static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, + const struct in6_addr *sk2_rcv_saddr6, + __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, + bool sk1_ipv6only, bool sk2_ipv6only, + bool match_wildcard) +{ + int addr_type = ipv6_addr_type(sk1_rcv_saddr6); + int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; + + /* if both are mapped, treat as IPv4 */ + if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { + if (!sk2_ipv6only) { + if (sk1_rcv_saddr == sk2_rcv_saddr) + return 1; + if (!sk1_rcv_saddr || !sk2_rcv_saddr) + return match_wildcard; + } + return 0; + } + + if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY) + return 1; + + if (addr_type2 == IPV6_ADDR_ANY && match_wildcard && + !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) + return 1; + + if (addr_type == IPV6_ADDR_ANY && match_wildcard && + !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) + return 1; + + if (sk2_rcv_saddr6 && + ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6)) + return 1; + + return 0; +} +#endif + +/* match_wildcard == true: 0.0.0.0 equals to any IPv4 addresses + * match_wildcard == false: addresses must be exactly the same, i.e. + * 0.0.0.0 only equals to 0.0.0.0 + */ +static int ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, + bool sk2_ipv6only, bool match_wildcard) +{ + if (!sk2_ipv6only) { + if (sk1_rcv_saddr == sk2_rcv_saddr) + return 1; + if (!sk1_rcv_saddr || !sk2_rcv_saddr) + return match_wildcard; + } + return 0; +} + +int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, + bool match_wildcard) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + return ipv6_rcv_saddr_equal(&sk->sk_v6_rcv_saddr, + &sk2->sk_v6_rcv_saddr, + sk->sk_rcv_saddr, + sk2->sk_rcv_saddr, + ipv6_only_sock(sk), + ipv6_only_sock(sk2), + match_wildcard); +#endif + return ipv4_rcv_saddr_equal(sk->sk_rcv_saddr, sk2->sk_rcv_saddr, + ipv6_only_sock(sk2), match_wildcard); +} +EXPORT_SYMBOL(inet_rcv_saddr_equal); + void inet_get_local_port_range(struct net *net, int *low, int *high) { unsigned int seq; @@ -44,9 +124,9 @@ void inet_get_local_port_range(struct net *net, int *low, int *high) } EXPORT_SYMBOL(inet_get_local_port_range); -int inet_csk_bind_conflict(const struct sock *sk, - const struct inet_bind_bucket *tb, bool relax, - bool reuseport_ok) +static int inet_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb, + bool relax, bool reuseport_ok) { struct sock *sk2; bool reuse = sk->sk_reuse; @@ -62,7 +142,6 @@ int inet_csk_bind_conflict(const struct sock *sk, sk_for_each_bound(sk2, &tb->owners) { if (sk != sk2 && - !inet_v6_ipv6only(sk2) && (!sk->sk_bound_dev_if || !sk2->sk_bound_dev_if || sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { @@ -72,54 +151,34 @@ int inet_csk_bind_conflict(const struct sock *sk, rcu_access_pointer(sk->sk_reuseport_cb) || (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(uid, sock_i_uid(sk2))))) { - - if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr || - sk2->sk_rcv_saddr == sk->sk_rcv_saddr) + if (inet_rcv_saddr_equal(sk, sk2, true)) break; } if (!relax && reuse && sk2->sk_reuse && sk2->sk_state != TCP_LISTEN) { - - if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr || - sk2->sk_rcv_saddr == sk->sk_rcv_saddr) + if (inet_rcv_saddr_equal(sk, sk2, true)) break; } } } return sk2 != NULL; } -EXPORT_SYMBOL_GPL(inet_csk_bind_conflict); -/* Obtain a reference to a local port for the given sock, - * if snum is zero it means select any available local port. - * We try to allocate an odd port (and leave even ports for connect()) +/* + * Find an open port number for the socket. Returns with the + * inet_bind_hashbucket lock held. */ -int inet_csk_get_port(struct sock *sk, unsigned short snum) +static struct inet_bind_hashbucket * +inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *port_ret) { - bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; - int ret = 1, attempts = 5, port = snum; - int smallest_size = -1, smallest_port; + int port = 0; struct inet_bind_hashbucket *head; struct net *net = sock_net(sk); int i, low, high, attempt_half; struct inet_bind_bucket *tb; - kuid_t uid = sock_i_uid(sk); u32 remaining, offset; - bool reuseport_ok = !!snum; - if (port) { -have_port: - head = &hinfo->bhash[inet_bhashfn(net, port, - hinfo->bhash_size)]; - spin_lock_bh(&head->lock); - inet_bind_bucket_for_each(tb, &head->chain) - if (net_eq(ib_net(tb), net) && tb->port == port) - goto tb_found; - - goto tb_not_found; - } -again: attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; other_half_scan: inet_get_local_port_range(net, &low, &high); @@ -143,8 +202,6 @@ other_half_scan: * We do the opposite to not pollute connect() users. */ offset |= 1U; - smallest_size = -1; - smallest_port = low; /* avoid compiler warning */ other_parity_scan: port = low + offset; @@ -158,30 +215,17 @@ other_parity_scan: spin_lock_bh(&head->lock); inet_bind_bucket_for_each(tb, &head->chain) if (net_eq(ib_net(tb), net) && tb->port == port) { - if (((tb->fastreuse > 0 && reuse) || - (tb->fastreuseport > 0 && - sk->sk_reuseport && - !rcu_access_pointer(sk->sk_reuseport_cb) && - uid_eq(tb->fastuid, uid))) && - (tb->num_owners < smallest_size || smallest_size == -1)) { - smallest_size = tb->num_owners; - smallest_port = port; - } - if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false, - reuseport_ok)) - goto tb_found; + if (!inet_csk_bind_conflict(sk, tb, false, false)) + goto success; goto next_port; } - goto tb_not_found; + tb = NULL; + goto success; next_port: spin_unlock_bh(&head->lock); cond_resched(); } - if (smallest_size != -1) { - port = smallest_port; - goto have_port; - } offset--; if (!(offset & 1)) goto other_parity_scan; @@ -191,8 +235,74 @@ next_port: attempt_half = 2; goto other_half_scan; } - return ret; + return NULL; +success: + *port_ret = port; + *tb_ret = tb; + return head; +} +static inline int sk_reuseport_match(struct inet_bind_bucket *tb, + struct sock *sk) +{ + kuid_t uid = sock_i_uid(sk); + + if (tb->fastreuseport <= 0) + return 0; + if (!sk->sk_reuseport) + return 0; + if (rcu_access_pointer(sk->sk_reuseport_cb)) + return 0; + if (!uid_eq(tb->fastuid, uid)) + return 0; + /* We only need to check the rcv_saddr if this tb was once marked + * without fastreuseport and then was reset, as we can only know that + * the fast_*rcv_saddr doesn't have any conflicts with the socks on the + * owners list. + */ + if (tb->fastreuseport == FASTREUSEPORT_ANY) + return 1; +#if IS_ENABLED(CONFIG_IPV6) + if (tb->fast_sk_family == AF_INET6) + return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr, + &sk->sk_v6_rcv_saddr, + tb->fast_rcv_saddr, + sk->sk_rcv_saddr, + tb->fast_ipv6_only, + ipv6_only_sock(sk), true); +#endif + return ipv4_rcv_saddr_equal(tb->fast_rcv_saddr, sk->sk_rcv_saddr, + ipv6_only_sock(sk), true); +} + +/* Obtain a reference to a local port for the given sock, + * if snum is zero it means select any available local port. + * We try to allocate an odd port (and leave even ports for connect()) + */ +int inet_csk_get_port(struct sock *sk, unsigned short snum) +{ + bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; + struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; + int ret = 1, port = snum; + struct inet_bind_hashbucket *head; + struct net *net = sock_net(sk); + struct inet_bind_bucket *tb = NULL; + kuid_t uid = sock_i_uid(sk); + + if (!port) { + head = inet_csk_find_open_port(sk, &tb, &port); + if (!head) + return ret; + if (!tb) + goto tb_not_found; + goto success; + } + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; + spin_lock_bh(&head->lock); + inet_bind_bucket_for_each(tb, &head->chain) + if (net_eq(ib_net(tb), net) && tb->port == port) + goto tb_found; tb_not_found: tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net, head, port); @@ -203,39 +313,54 @@ tb_found: if (sk->sk_reuse == SK_FORCE_REUSE) goto success; - if (((tb->fastreuse > 0 && reuse) || - (tb->fastreuseport > 0 && - !rcu_access_pointer(sk->sk_reuseport_cb) && - sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && - smallest_size == -1) + if ((tb->fastreuse > 0 && reuse) || + sk_reuseport_match(tb, sk)) goto success; - if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true, - reuseport_ok)) { - if ((reuse || - (tb->fastreuseport > 0 && - sk->sk_reuseport && - !rcu_access_pointer(sk->sk_reuseport_cb) && - uid_eq(tb->fastuid, uid))) && - !snum && smallest_size != -1 && --attempts >= 0) { - spin_unlock_bh(&head->lock); - goto again; - } + if (inet_csk_bind_conflict(sk, tb, true, true)) goto fail_unlock; + } +success: + if (!hlist_empty(&tb->owners)) { + tb->fastreuse = reuse; + if (sk->sk_reuseport) { + tb->fastreuseport = FASTREUSEPORT_ANY; + tb->fastuid = uid; + tb->fast_rcv_saddr = sk->sk_rcv_saddr; + tb->fast_ipv6_only = ipv6_only_sock(sk); +#if IS_ENABLED(CONFIG_IPV6) + tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; +#endif + } else { + tb->fastreuseport = 0; } + } else { if (!reuse) tb->fastreuse = 0; - if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)) - tb->fastreuseport = 0; - } else { - tb->fastreuse = reuse; if (sk->sk_reuseport) { - tb->fastreuseport = 1; - tb->fastuid = uid; + /* We didn't match or we don't have fastreuseport set on + * the tb, but we have sk_reuseport set on this socket + * and we know that there are no bind conflicts with + * this socket in this tb, so reset our tb's reuseport + * settings so that any subsequent sockets that match + * our current socket will be put on the fast path. + * + * If we reset we need to set FASTREUSEPORT_STRICT so we + * do extra checking for all subsequent sk_reuseport + * socks. + */ + if (!sk_reuseport_match(tb, sk)) { + tb->fastreuseport = FASTREUSEPORT_STRICT; + tb->fastuid = uid; + tb->fast_rcv_saddr = sk->sk_rcv_saddr; + tb->fast_ipv6_only = ipv6_only_sock(sk); +#if IS_ENABLED(CONFIG_IPV6) + tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; +#endif + } } else { tb->fastreuseport = 0; } } -success: if (!inet_csk(sk)->icsk_bind_hash) inet_bind_hash(sk, tb, port); WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); |