From ea3bea3a1d38aab1542176b2ff11a99ce3db9656 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 25 Sep 2015 07:39:23 -0700
Subject: tcp/dccp: constify rtx_synack() and friends

This is done to make sure we do not change the listener socket while
sending SYNACK packets, since the socket lock is not held.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/request_sock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 87935cad2f7b..ff7ce1e53ed4 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -32,7 +32,7 @@ struct request_sock_ops {
 	int		obj_size;
 	struct kmem_cache	*slab;
 	char		*slab_name;
-	int		(*rtx_syn_ack)(struct sock *sk,
+	int		(*rtx_syn_ack)(const struct sock *sk,
 				       struct request_sock *req);
 	void		(*send_ack)(struct sock *sk, struct sk_buff *skb,
 				    struct request_sock *req);

From 1b70e977cef6ce7e7411c9bbec21f9adc8e29097 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 25 Sep 2015 07:39:24 -0700
Subject: inet: constify inet_rtx_syn_ack() sock argument

SYNACK packets are sent on behalf of unlocked listeners or fastopen
sockets. Mark the socket as const to catch future changes that might
break the assumption.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/request_sock.h      | 2 +-
 net/ipv4/inet_connection_sock.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index ff7ce1e53ed4..181f97f9fe1c 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -42,7 +42,7 @@ struct request_sock_ops {
 	void		(*syn_ack_timeout)(const struct request_sock *req);
 };

-int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req);
+int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req);

 /* struct request_sock - mini sock to represent a connection request
  */
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index ad087c14f020..bac205136e1c 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -563,7 +563,7 @@ static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
 		  req->num_timeout >= rskq_defer_accept - 1;
 }

-int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
+int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
 {
 	int err = req->rsk_ops->rtx_syn_ack(parent, req);

From a00e74442bac5ad19a929d097370da7e07540ea6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 29 Sep 2015 07:42:39 -0700
Subject: tcp/dccp: constify send_synack and send_reset socket argument

None of these functions need to change the socket, so make it const.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/request_sock.h | 4 ++--
 net/dccp/dccp.h            | 2 +-
 net/dccp/ipv4.c            | 2 +-
 net/dccp/ipv6.c            | 2 +-
 net/dccp/minisocks.c       | 2 +-
 net/ipv4/tcp_ipv4.c        | 4 ++--
 net/ipv6/tcp_ipv6.c        | 12 ++++++------
 7 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 181f97f9fe1c..90247ec7955b 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -34,9 +34,9 @@ struct request_sock_ops {
 	char		*slab_name;
 	int		(*rtx_syn_ack)(const struct sock *sk,
 				       struct request_sock *req);
-	void		(*send_ack)(struct sock *sk, struct sk_buff *skb,
+	void		(*send_ack)(const struct sock *sk, struct sk_buff *skb,
 				    struct request_sock *req);
-	void		(*send_reset)(struct sock *sk,
+	void		(*send_reset)(const struct sock *sk,
 				      struct sk_buff *skb);
 	void		(*destructor)(struct request_sock *req);
 	void		(*syn_ack_timeout)(const struct request_sock *req);
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 31e96df500d1..8ed1df2771bd 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -229,7 +229,7 @@ void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb);
 int dccp_retransmit_skb(struct sock *sk);

 void dccp_send_ack(struct sock *sk);
-void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+void dccp_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 			 struct request_sock *rsk);

 void dccp_send_sync(struct sock *sk, const u64 seq,
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index a46ae9c69ccf..00a14fa4270a 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -527,7 +527,7 @@ out:
 	return err;
 }

-static void dccp_v4_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
+static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
 {
 	int err;
 	const struct iphdr *rxiph;
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 4fa199dc69a3..aa719e700961 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -234,7 +234,7 @@ static void dccp_v6_reqsk_destructor(struct request_sock *req)
 	kfree_skb(inet_rsk(req)->pktopts);
 }

-static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
+static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
 {
 	const struct ipv6hdr *rxip6h;
 	struct sk_buff *skb;
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 838f524cf11a..9bfd0dc1e6cb 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -236,7 +236,7 @@ int dccp_child_process(struct sock *parent, struct sock *child,

 EXPORT_SYMBOL_GPL(dccp_child_process);

-void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+void dccp_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 			 struct request_sock *rsk)
 {
 	DCCP_BUG("DCCP-ACK packets are never sent in LISTEN/RESPOND state");
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a23ba7daecbf..4300d0132b9f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -576,7 +576,7 @@ EXPORT_SYMBOL(tcp_v4_send_check);
  *	Exception: precedence violation. We do not implement it in any case.
  */

-static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
+static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 {
 	const struct tcphdr *th = tcp_hdr(skb);
 	struct {
@@ -795,7 +795,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 	inet_twsk_put(tw);
 }

-static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 				  struct request_sock *req)
 {
 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 16fb299dcab8..c47e5c87a2a8 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -70,8 +70,8 @@
 #include
 #include

-static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb);
-static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb);
+static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 				  struct request_sock *req);
 static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
@@ -724,7 +724,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
 	.queue_hash_add	= inet6_csk_reqsk_queue_hash_add,
 };

-static void tcp_v6_send_response(struct sock *sk, struct sk_buff *skb, u32 seq,
+static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq,
 				 u32 ack, u32 win, u32 tsval, u32 tsecr,
 				 int oif, struct tcp_md5sig_key *key, int rst,
 				 u8 tclass, u32 label)
@@ -823,7 +823,7 @@ static void tcp_v6_send_response(struct sock *sk, struct sk_buff *skb, u32 seq,
 	kfree_skb(buff);
 }

-static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
+static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 {
 	const struct tcphdr *th = tcp_hdr(skb);
 	u32 seq = 0, ack_seq = 0;
@@ -894,7 +894,7 @@ release_sk1:
 #endif
 }

-static void tcp_v6_send_ack(struct sock *sk, struct sk_buff *skb, u32 seq,
+static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq,
 			    u32 ack, u32 win, u32 tsval, u32 tsecr, int oif,
 			    struct tcp_md5sig_key *key, u8 tclass,
 			    u32 label)
@@ -917,7 +917,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
 	inet_twsk_put(tw);
 }

-static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 				  struct request_sock *req)
 {
 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
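The three constification patches above share one idea: the compiler, not the
socket lock, now enforces that the SYNACK/RST transmit paths leave the
listener untouched. A minimal standalone C sketch of that guarantee (the
*_sketch names are invented for illustration, they are not kernel types):

	struct sock_sketch {
		int sk_state;
	};

	/* mirrors the ops-table change: the callback takes a const socket */
	struct ops_sketch {
		int (*rtx_syn_ack)(const struct sock_sketch *sk);
	};

	static int rtx_syn_ack_ok(const struct sock_sketch *sk)
	{
		return sk->sk_state;	/* read-only access compiles fine */
	}

	/* A body containing 'sk->sk_state = 0;' would now fail to compile
	 * ("assignment of read-only location") instead of silently racing
	 * with other CPUs once SYNACKs are sent without the socket lock.
	 */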
From 2985aaac010ebd5e562ce1a22cc61acbb0e40cf2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 29 Sep 2015 07:42:51 -0700
Subject: tcp: constify tcp_syn_flood_action() socket argument

tcp_syn_flood_action() will soon be called with an unlocked socket. In
order to avoid the SYN flood warning being emitted multiple times, use
xchg().

Extend the max_qlen_log and synflood_warned fields in struct listen_sock
to u32.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/request_sock.h | 5 ++---
 net/ipv4/tcp_input.c       | 9 +++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 90247ec7955b..c146b5284786 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -129,9 +129,8 @@ struct listen_sock {
 	atomic_t		qlen_dec; /* qlen = qlen_inc - qlen_dec */
 	atomic_t		young_dec;

-	u8			max_qlen_log ____cacheline_aligned_in_smp;
-	u8			synflood_warned;
-	/* 2 bytes hole, try to use */
+	u32			max_qlen_log ____cacheline_aligned_in_smp;
+	u32			synflood_warned;
 	u32			hash_rnd;
 	u32			nr_table_entries;
 	struct request_sock	*syn_table[0];
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 67b27aee8d28..e58cbcd2f07e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6064,7 +6064,7 @@ EXPORT_SYMBOL(inet_reqsk_alloc);
 /*
  * Return true if a syncookie should be sent
  */
-static bool tcp_syn_flood_action(struct sock *sk,
+static bool tcp_syn_flood_action(const struct sock *sk,
 				 const struct sk_buff *skb,
 				 const char *proto)
 {
@@ -6082,11 +6082,12 @@ static bool tcp_syn_flood_action(struct sock *sk,
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

 	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
-	if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
-		lopt->synflood_warned = 1;
+	if (!lopt->synflood_warned &&
+	    sysctl_tcp_syncookies != 2 &&
+	    xchg(&lopt->synflood_warned, 1) == 0)
 		pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
 			proto, ntohs(tcp_hdr(skb)->dest), msg);
-	}
+
 	return want_cookie;
 }
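The xchg() trick above is easy to miss in the diff. A userspace C11
analogue (warn_once() and the counter are invented names for illustration):
only the thread that atomically flips the flag from 0 to 1 prints, no matter
how many CPUs race into the function without a lock.

	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_uint synflood_warned;

	static void warn_once(int port)
	{
		/* the plain load filters the common already-warned case;
		 * atomic_exchange() plays the role of xchg() and returns
		 * the previous value, so exactly one racing caller
		 * observes 0 and emits the message
		 */
		if (atomic_load(&synflood_warned) == 0 &&
		    atomic_exchange(&synflood_warned, 1) == 0)
			printf("Possible SYN flooding on port %d\n", port);
	}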
From 0536fcc039a8926ec12ec587f41a83f7acafeb82 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 29 Sep 2015 07:42:52 -0700
Subject: tcp: prepare fastopen code for upcoming listener changes

While auditing the TCP stack for the upcoming 'lockless' listener changes,
I found I had to change fastopen_init_queue() to properly init the object
before publishing it.

Otherwise another cpu could try to lock the spinlock before it gets
properly initialized.

Instead of adding appropriate barriers, just remove the dynamic memory
allocations:
- The structure is 28 bytes on 64bit arches. Using an additional 8 bytes
  for holding a pointer seems overkill.
- Two listeners can share the same cache line and performance would suffer.

If we really want to save a few bytes, we would instead dynamically
allocate the whole struct request_sock_queue in the future.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/linux/tcp.h             | 22 ++++------------------
 include/net/request_sock.h      |  7 ++-----
 net/core/request_sock.c         |  9 ++++++++-
 net/ipv4/af_inet.c              | 10 +++-------
 net/ipv4/inet_connection_sock.c | 17 ++++++++---------
 net/ipv4/tcp.c                  | 14 ++------------
 net/ipv4/tcp_fastopen.c         | 10 +++++-----
 net/ipv4/tcp_ipv4.c             |  2 +-
 net/ipv6/tcp_ipv6.c             |  4 ++--
 9 files changed, 35 insertions(+), 60 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index fcb573be75d9..e442e6e9a365 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -382,25 +382,11 @@ static inline bool tcp_passive_fastopen(const struct sock *sk)
 		tcp_sk(sk)->fastopen_rsk != NULL);
 }

-extern void tcp_sock_destruct(struct sock *sk);
-
-static inline int fastopen_init_queue(struct sock *sk, int backlog)
+static inline void fastopen_queue_tune(struct sock *sk, int backlog)
 {
-	struct request_sock_queue *queue =
-	    &inet_csk(sk)->icsk_accept_queue;
-
-	if (queue->fastopenq == NULL) {
-		queue->fastopenq = kzalloc(
-		    sizeof(struct fastopen_queue),
-		    sk->sk_allocation);
-		if (queue->fastopenq == NULL)
-			return -ENOMEM;
-
-		sk->sk_destruct = tcp_sock_destruct;
-		spin_lock_init(&queue->fastopenq->lock);
-	}
-	queue->fastopenq->max_qlen = backlog;
-	return 0;
+	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
+
+	queue->fastopenq.max_qlen = backlog;
 }

 static inline void tcp_saved_syn_free(struct tcp_sock *tp)
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index c146b5284786..d2544de329bd 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -180,11 +180,8 @@ struct request_sock_queue {
 	struct request_sock	*rskq_accept_tail;
 	u8			rskq_defer_accept;
 	struct listen_sock	*listen_opt;
-	struct fastopen_queue	*fastopenq; /* This is non-NULL iff TFO has been
-					     * enabled on this listener. Check
-					     * max_qlen != 0 in fastopen_queue
-					     * to determine if TFO is enabled
-					     * right at this moment.
+	struct fastopen_queue	fastopenq;  /* Check max_qlen != 0 to determine
+					     * if TFO is enabled.
 					     */

 	/* temporary alignment, our goal is to get rid of this lock */
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index b42f0e26f89e..e22cfa4ed25f 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -59,6 +59,13 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
 	get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
 	spin_lock_init(&queue->syn_wait_lock);
+
+	spin_lock_init(&queue->fastopenq.lock);
+	queue->fastopenq.rskq_rst_head = NULL;
+	queue->fastopenq.rskq_rst_tail = NULL;
+	queue->fastopenq.qlen = 0;
+	queue->fastopenq.max_qlen = 0;
+
 	queue->rskq_accept_head = NULL;
 	lopt->nr_table_entries = nr_table_entries;
 	lopt->max_qlen_log = ilog2(nr_table_entries);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8a556643b874..3af85eecbe11 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -219,17 +219,13 @@ int inet_listen(struct socket *sock, int backlog)
 		 * shutdown() (rather than close()).
 		 */
 		if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
-		    !inet_csk(sk)->icsk_accept_queue.fastopenq) {
+		    !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) {
 			if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
-				err = fastopen_init_queue(sk, backlog);
+				fastopen_queue_tune(sk, backlog);
 			else if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT2) != 0)
-				err = fastopen_init_queue(sk,
+				fastopen_queue_tune(sk,
 				    ((uint)sysctl_tcp_fastopen) >> 16);
-			else
-				err = 0;
-			if (err)
-				goto out;

 			tcp_fastopen_init_key_once(true);
 		}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 694a5e8f4f9f..e1527882a578 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -335,9 +335,8 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 	sk_acceptq_removed(sk);
 	if (sk->sk_protocol == IPPROTO_TCP &&
-	    tcp_rsk(req)->tfo_listener &&
-	    queue->fastopenq) {
-		spin_lock_bh(&queue->fastopenq->lock);
+	    tcp_rsk(req)->tfo_listener) {
+		spin_lock_bh(&queue->fastopenq.lock);
 		if (tcp_rsk(req)->tfo_listener) {
 			/* We are still waiting for the final ACK from 3WHS
 			 * so can't free req now. Instead, we set req->sk to
@@ -348,7 +347,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 			req->sk = NULL;
 			req = NULL;
 		}
-		spin_unlock_bh(&queue->fastopenq->lock);
+		spin_unlock_bh(&queue->fastopenq.lock);
 	}
 out:
 	release_sock(sk);
@@ -886,12 +885,12 @@ void inet_csk_listen_stop(struct sock *sk)
 		sk_acceptq_removed(sk);
 		reqsk_put(req);
 	}
-	if (queue->fastopenq) {
+	if (queue->fastopenq.rskq_rst_head) {
 		/* Free all the reqs queued in rskq_rst_head. */
-		spin_lock_bh(&queue->fastopenq->lock);
-		acc_req = queue->fastopenq->rskq_rst_head;
-		queue->fastopenq->rskq_rst_head = NULL;
-		spin_unlock_bh(&queue->fastopenq->lock);
+		spin_lock_bh(&queue->fastopenq.lock);
+		acc_req = queue->fastopenq.rskq_rst_head;
+		queue->fastopenq.rskq_rst_head = NULL;
+		spin_unlock_bh(&queue->fastopenq.lock);
 		while ((req = acc_req) != NULL) {
 			acc_req = req->dl_next;
 			reqsk_put(req);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b8b8fa184f75..3c96fa87ff9e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2253,13 +2253,6 @@ int tcp_disconnect(struct sock *sk, int flags)
 }
 EXPORT_SYMBOL(tcp_disconnect);

-void tcp_sock_destruct(struct sock *sk)
-{
-	inet_sock_destruct(sk);
-
-	kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
-}
-
 static inline bool tcp_can_repair_sock(const struct sock *sk)
 {
 	return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
@@ -2581,7 +2574,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		    TCPF_LISTEN))) {
 			tcp_fastopen_init_key_once(true);

-			err = fastopen_init_queue(sk, val);
+			fastopen_queue_tune(sk, val);
 		} else {
 			err = -EINVAL;
 		}
@@ -2849,10 +2842,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 		break;

 	case TCP_FASTOPEN:
-		if (icsk->icsk_accept_queue.fastopenq)
-			val = icsk->icsk_accept_queue.fastopenq->max_qlen;
-		else
-			val = 0;
+		val = icsk->icsk_accept_queue.fastopenq.max_qlen;
 		break;

 	case TCP_TIMESTAMP:
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index db43c6286cf7..f69f436fcbcc 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -142,9 +142,9 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
 	if (!child)
 		return NULL;

-	spin_lock(&queue->fastopenq->lock);
-	queue->fastopenq->qlen++;
-	spin_unlock(&queue->fastopenq->lock);
+	spin_lock(&queue->fastopenq.lock);
+	queue->fastopenq.qlen++;
+	spin_unlock(&queue->fastopenq.lock);

 	/* Initialize the child socket. Have to fix some values to take
 	 * into account the child is a Fast Open socket and is created
@@ -237,8 +237,8 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
 	 * between qlen overflow causing Fast Open to be disabled
 	 * temporarily vs a server not supporting Fast Open at all.
 	 */
-	fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
-	if (!fastopenq || fastopenq->max_qlen == 0)
+	fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq;
+	if (fastopenq->max_qlen == 0)
 		return false;

 	if (fastopenq->qlen >= fastopenq->max_qlen) {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f551e9e862db..64ece718d66c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2186,7 +2186,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
 	const struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	const struct inet_sock *inet = inet_sk(sk);
-	struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
+	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
 	__be32 dest = inet->inet_daddr;
 	__be32 src = inet->inet_rcv_saddr;
 	__u16 destp = ntohs(inet->inet_dport);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 97bc26e0cd0f..0ac64f47f882 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1672,7 +1672,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
 	const struct inet_sock *inet = inet_sk(sp);
 	const struct tcp_sock *tp = tcp_sk(sp);
 	const struct inet_connection_sock *icsk = inet_csk(sp);
-	struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
+	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;

 	dest = &sp->sk_v6_daddr;
 	src = &sp->sk_v6_rcv_saddr;
@@ -1716,7 +1716,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
 		   (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
 		   tp->snd_cwnd,
 		   sp->sk_state == TCP_LISTEN ?
-			(fastopenq ? fastopenq->max_qlen : 0) :
+			fastopenq->max_qlen :
 			(tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)
 		   );
 }
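The "init before publish" problem this patch sidesteps is worth spelling
out. A standalone C sketch of the two layouts (invented *_sketch types,
not kernel code): with the pointer, a second CPU could observe the pointer
non-NULL before the spinlock inside was initialized; with the embedded
struct, everything is set up once in reqsk_queue_alloc() and "TFO enabled"
degenerates to a plain field test.

	#include <stdbool.h>

	struct fastopen_queue_sketch {
		int qlen;
		int max_qlen;
	};

	struct accept_queue_sketch {
		/* embedded, not a lazily kzalloc'ed pointer */
		struct fastopen_queue_sketch fastopenq;
	};

	static void fastopen_queue_tune_sketch(struct accept_queue_sketch *q,
					       int backlog)
	{
		q->fastopenq.max_qlen = backlog;	/* no allocation, no error path */
	}

	static bool tfo_enabled(const struct accept_queue_sketch *q)
	{
		return q->fastopenq.max_qlen != 0;	/* replaces the NULL check */
	}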
From fff1f3001cc58b5064a0f1154a7ac09b76f29c44 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 2 Oct 2015 11:43:23 -0700
Subject: tcp: add a spinlock to protect struct request_sock_queue

struct request_sock_queue fields are currently protected by the listener
'lock' (not a real spinlock).

We need to add a private spinlock instead, so that softirq handlers
creating children do not have to worry about the backlog notion that the
listener 'lock' carries.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/request_sock.h      | 37 ++++++++++++++++++-------------------
 net/core/request_sock.c         |  1 +
 net/ipv4/inet_connection_sock.c | 21 +++++++--------------
 3 files changed, 26 insertions(+), 33 deletions(-)

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index d2544de329bd..202e36163ae3 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -176,9 +176,11 @@ struct fastopen_queue {
  *
  */
 struct request_sock_queue {
+	spinlock_t		rskq_lock;
+	u8			rskq_defer_accept;
+
 	struct request_sock	*rskq_accept_head;
 	struct request_sock	*rskq_accept_tail;
-	u8			rskq_defer_accept;
 	struct listen_sock	*listen_opt;
 	struct fastopen_queue	fastopenq;  /* Check max_qlen != 0 to determine
 					     * if TFO is enabled.
@@ -196,16 +198,7 @@ void reqsk_queue_destroy(struct request_sock_queue *queue);
 void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
 			   bool reset);

-static inline struct request_sock *
-	reqsk_queue_yank_acceptq(struct request_sock_queue *queue)
-{
-	struct request_sock *req = queue->rskq_accept_head;
-
-	queue->rskq_accept_head = NULL;
-	return req;
-}
-
-static inline int reqsk_queue_empty(struct request_sock_queue *queue)
+static inline bool reqsk_queue_empty(const struct request_sock_queue *queue)
 {
 	return queue->rskq_accept_head == NULL;
 }
@@ -215,6 +208,7 @@ static inline void reqsk_queue_add(struct request_sock_queue *queue,
 				   struct sock *parent,
 				   struct sock *child)
 {
+	spin_lock(&queue->rskq_lock);
 	req->sk = child;
 	sk_acceptq_added(parent);

@@ -225,18 +219,23 @@ static inline void reqsk_queue_add(struct request_sock_queue *queue,
 	queue->rskq_accept_tail = req;
 	req->dl_next = NULL;
+	spin_unlock(&queue->rskq_lock);
 }

-static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue *queue)
+static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue *queue,
+						      struct sock *parent)
 {
-	struct request_sock *req = queue->rskq_accept_head;
-
-	WARN_ON(req == NULL);
-
-	queue->rskq_accept_head = req->dl_next;
-	if (queue->rskq_accept_head == NULL)
-		queue->rskq_accept_tail = NULL;
+	struct request_sock *req;
+
+	spin_lock_bh(&queue->rskq_lock);
+	req = queue->rskq_accept_head;
+	if (req) {
+		sk_acceptq_removed(parent);
+		queue->rskq_accept_head = req->dl_next;
+		if (queue->rskq_accept_head == NULL)
+			queue->rskq_accept_tail = NULL;
+	}
+	spin_unlock_bh(&queue->rskq_lock);
 	return req;
 }

diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index e22cfa4ed25f..8d9fd31d3d06 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -58,6 +58,7 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,

 	get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
+	spin_lock_init(&queue->rskq_lock);
 	spin_lock_init(&queue->syn_wait_lock);

 	spin_lock_init(&queue->fastopenq.lock);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index e1527882a578..0085612b9e49 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -330,10 +330,9 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 		if (error)
 			goto out_err;
 	}
-	req = reqsk_queue_remove(queue);
+	req = reqsk_queue_remove(queue, sk);
 	newsk = req->sk;

-	sk_acceptq_removed(sk);
 	if (sk->sk_protocol == IPPROTO_TCP &&
 	    tcp_rsk(req)->tfo_listener) {
 		spin_lock_bh(&queue->fastopenq.lock);
@@ -832,11 +831,7 @@ void inet_csk_listen_stop(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
-	struct request_sock *acc_req;
-	struct request_sock *req;
-
-	/* make all the listen_opt local to us */
-	acc_req = reqsk_queue_yank_acceptq(queue);
+	struct request_sock *next, *req;

 	/* Following specs, it would be better either to send FIN
 	 * (and enter FIN-WAIT-1, it is normal close)
@@ -848,11 +843,9 @@ void inet_csk_listen_stop(struct sock *sk)
 	 */
 	reqsk_queue_destroy(queue);

-	while ((req = acc_req) != NULL) {
+	while ((req = reqsk_queue_remove(queue, sk)) != NULL) {
 		struct sock *child = req->sk;

-		acc_req = req->dl_next;
-
 		local_bh_disable();
 		bh_lock_sock(child);
 		WARN_ON(sock_owned_by_user(child));
@@ -882,18 +875,18 @@ void inet_csk_listen_stop(struct sock *sk)
 		local_bh_enable();
 		sock_put(child);

-		sk_acceptq_removed(sk);
 		reqsk_put(req);
 	}
 	if (queue->fastopenq.rskq_rst_head) {
 		/* Free all the reqs queued in rskq_rst_head. */
 		spin_lock_bh(&queue->fastopenq.lock);
-		acc_req = queue->fastopenq.rskq_rst_head;
+		req = queue->fastopenq.rskq_rst_head;
 		queue->fastopenq.rskq_rst_head = NULL;
 		spin_unlock_bh(&queue->fastopenq.lock);
-		while ((req = acc_req) != NULL) {
-			acc_req = req->dl_next;
+		while (req != NULL) {
+			next = req->dl_next;
 			reqsk_put(req);
+			req = next;
 		}
 	}
 	WARN_ON(sk->sk_ack_backlog);
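The reworked reqsk_queue_remove() is the heart of this patch: the FIFO pop
and the backlog accounting move under one private lock, and an empty queue
returns NULL instead of tripping a WARN_ON. A userspace sketch of the same
shape using pthreads (invented *_sketch types):

	#include <pthread.h>
	#include <stddef.h>

	struct req_sketch {
		struct req_sketch *dl_next;
	};

	struct req_queue_sketch {
		pthread_mutex_t lock;	/* plays the role of rskq_lock */
		struct req_sketch *head, *tail;
	};

	static struct req_sketch *queue_remove(struct req_queue_sketch *q)
	{
		struct req_sketch *req;

		pthread_mutex_lock(&q->lock);
		req = q->head;
		if (req) {
			q->head = req->dl_next;
			if (!q->head)
				q->tail = NULL;
		}
		pthread_mutex_unlock(&q->lock);
		return req;	/* NULL when the queue is empty */
	}

This is also why inet_csk_listen_stop() can simply loop on
reqsk_queue_remove() until it returns NULL, instead of yanking the whole
list up front.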
From aac065c50aba0c534a929aeb687eb68c58e523b8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 2 Oct 2015 11:43:24 -0700
Subject: tcp: move qlen/young out of struct listen_sock

qlen_inc & young_inc were protected by the listener lock, while qlen_dec
& young_dec were atomic fields.

Everything needs to be atomic for the upcoming lockless listener.

Also move qlen/young into struct request_sock_queue, as we'll get rid of
struct listen_sock eventually.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/request_sock.h      | 40 ++++++++++------------------------------
 net/core/request_sock.c         |  8 ++++----
 net/ipv4/inet_connection_sock.c |  6 +++---
 net/ipv4/inet_diag.c            |  2 +-
 4 files changed, 18 insertions(+), 38 deletions(-)

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 202e36163ae3..d128e7f89042 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -122,14 +122,7 @@ extern int sysctl_max_syn_backlog;
  * @max_qlen_log - log_2 of maximal queued SYNs/REQUESTs
  */
 struct listen_sock {
-	int			qlen_inc; /* protected by listener lock */
-	int			young_inc;/* protected by listener lock */
-
-	/* following fields can be updated by timer */
-	atomic_t		qlen_dec; /* qlen = qlen_inc - qlen_dec */
-	atomic_t		young_dec;
-
-	u32			max_qlen_log ____cacheline_aligned_in_smp;
+	u32			max_qlen_log;
 	u32			synflood_warned;
 	u32			hash_rnd;
 	u32			nr_table_entries;
 	struct request_sock	*syn_table[0];
@@ -179,6 +172,9 @@ struct request_sock_queue {
 	spinlock_t		rskq_lock;
 	u8			rskq_defer_accept;

+	atomic_t		qlen;
+	atomic_t		young;
+
 	struct request_sock	*rskq_accept_head;
 	struct request_sock	*rskq_accept_tail;
 	struct listen_sock	*listen_opt;
@@ -242,41 +238,25 @@ static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue
 static inline void reqsk_queue_removed(struct request_sock_queue *queue,
 				       const struct request_sock *req)
 {
-	struct listen_sock *lopt = queue->listen_opt;
-
 	if (req->num_timeout == 0)
-		atomic_inc(&lopt->young_dec);
-	atomic_inc(&lopt->qlen_dec);
+		atomic_dec(&queue->young);
+	atomic_dec(&queue->qlen);
 }

 static inline void reqsk_queue_added(struct request_sock_queue *queue)
 {
-	struct listen_sock *lopt = queue->listen_opt;
-
-	lopt->young_inc++;
-	lopt->qlen_inc++;
-}
-
-static inline int listen_sock_qlen(const struct listen_sock *lopt)
-{
-	return lopt->qlen_inc - atomic_read(&lopt->qlen_dec);
-}
-
-static inline int listen_sock_young(const struct listen_sock *lopt)
-{
-	return lopt->young_inc - atomic_read(&lopt->young_dec);
+	atomic_inc(&queue->young);
+	atomic_inc(&queue->qlen);
 }

 static inline int reqsk_queue_len(const struct request_sock_queue *queue)
 {
-	const struct listen_sock *lopt = queue->listen_opt;
-
-	return lopt ? listen_sock_qlen(lopt) : 0;
+	return atomic_read(&queue->qlen);
 }

 static inline int reqsk_queue_len_young(const struct request_sock_queue *queue)
 {
-	return listen_sock_young(queue->listen_opt);
+	return atomic_read(&queue->young);
 }

 static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 8d9fd31d3d06..5ca624cea04c 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -102,7 +102,7 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
 	/* make all the listen_opt local to us */
 	struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);

-	if (listen_sock_qlen(lopt) != 0) {
+	if (reqsk_queue_len(queue) != 0) {
 		unsigned int i;

 		for (i = 0; i < lopt->nr_table_entries; i++) {
@@ -116,7 +116,7 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
 				 * or risk a dead lock.
 				 */
 				spin_unlock_bh(&queue->syn_wait_lock);
-				atomic_inc(&lopt->qlen_dec);
+				atomic_dec(&queue->qlen);
 				if (del_timer_sync(&req->rsk_timer))
 					reqsk_put(req);
 				reqsk_put(req);
@@ -126,8 +126,8 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
 		}
 	}

-	if (WARN_ON(listen_sock_qlen(lopt) != 0))
-		pr_err("qlen %u\n", listen_sock_qlen(lopt));
+	if (WARN_ON(reqsk_queue_len(queue) != 0))
+		pr_err("qlen %u\n", reqsk_queue_len(queue));
 	kvfree(lopt);
 }

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 0085612b9e49..093ef04e6ebf 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -640,9 +640,9 @@ static void reqsk_timer_handler(unsigned long data)
 	 * embrions; and abort old ones without pity, if old
 	 * ones are about to clog our table.
 	 */
-	qlen = listen_sock_qlen(lopt);
+	qlen = reqsk_queue_len(queue);
 	if (qlen >> (lopt->max_qlen_log - 1)) {
-		int young = listen_sock_young(lopt) << 1;
+		int young = reqsk_queue_len_young(queue) << 1;

 		while (thresh > 2) {
 			if (qlen < young)
@@ -664,7 +664,7 @@ static void reqsk_timer_handler(unsigned long data)
 		unsigned long timeo;

 		if (req->num_timeout++ == 0)
-			atomic_inc(&lopt->young_dec);
+			atomic_dec(&queue->young);
 		timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
 		mod_timer_pinned(&req->rsk_timer, jiffies + timeo);
 		return;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index c3b1f3a0f4cf..0ac1d68dc8a6 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -753,7 +753,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 	spin_lock(&icsk->icsk_accept_queue.syn_wait_lock);

 	lopt = icsk->icsk_accept_queue.listen_opt;
-	if (!lopt || !listen_sock_qlen(lopt))
+	if (!lopt || !reqsk_queue_len(&icsk->icsk_accept_queue))
 		goto out;

 	if (bc) {
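The old split-counter scheme computed qlen = qlen_inc - qlen_dec so that
the lock-holder incremented plain ints while timers decremented atomics.
Once no lock is held anywhere, a single atomic per counter is simpler and
correct. A C11 userspace sketch of the resulting accounting (names chosen
to mirror the kernel helpers, but this is not kernel code):

	#include <stdatomic.h>

	static atomic_int qlen, young;

	static void queue_added(void)
	{
		atomic_fetch_add(&young, 1);	/* every new req starts "young" */
		atomic_fetch_add(&qlen, 1);
	}

	static void queue_removed(int num_timeout)
	{
		/* a req stops being "young" after its first retransmit */
		if (num_timeout == 0)
			atomic_fetch_sub(&young, 1);
		atomic_fetch_sub(&qlen, 1);
	}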
From 8d2675f1e464aa5cedda63849adecffd8d33fead Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 2 Oct 2015 11:43:25 -0700
Subject: tcp: move synflood_warned into struct request_sock_queue

The long term plan is to remove struct listen_sock once its hash table is
no longer there.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/request_sock.h | 2 +-
 net/ipv4/tcp_input.c       | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index d128e7f89042..273fb7235ce3 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -123,7 +123,6 @@ extern int sysctl_max_syn_backlog;
  */
 struct listen_sock {
 	u32			max_qlen_log;
-	u32			synflood_warned;
 	u32			hash_rnd;
 	u32			nr_table_entries;
 	struct request_sock	*syn_table[0];
@@ -171,6 +170,7 @@ struct fastopen_queue {
 struct request_sock_queue {
 	spinlock_t		rskq_lock;
 	u8			rskq_defer_accept;
+	u32			synflood_warned;

 	atomic_t		qlen;
 	atomic_t		young;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e58cbcd2f07e..8b0ce73c2049 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6068,9 +6068,9 @@ static bool tcp_syn_flood_action(const struct sock *sk,
 				 const struct sk_buff *skb,
 				 const char *proto)
 {
+	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
 	const char *msg = "Dropping request";
 	bool want_cookie = false;
-	struct listen_sock *lopt;

 #ifdef CONFIG_SYN_COOKIES
 	if (sysctl_tcp_syncookies) {
@@ -6081,10 +6081,9 @@ static bool tcp_syn_flood_action(const struct sock *sk,
 #endif

 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
-	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
-	if (!lopt->synflood_warned &&
+	if (!queue->synflood_warned &&
 	    sysctl_tcp_syncookies != 2 &&
-	    xchg(&lopt->synflood_warned, 1) == 0)
+	    xchg(&queue->synflood_warned, 1) == 0)
 		pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
 			proto, ntohs(tcp_hdr(skb)->dest), msg);
From b267cdd1075d28501b7c05c7aeb8466775505e8d Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 2 Oct 2015 11:43:27 -0700
Subject: tcp/dccp: init sk_prot and call sk_node_init() in reqsk_alloc()

We plan to use generic functions to insert request sockets into the ehash
table.

sk_prot needs to be set (to retrieve sk_prot->h.hashinfo), and sk_node
needs to be cleared.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/request_sock.h | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 273fb7235ce3..97c1ba61ed2d 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -69,6 +69,16 @@ struct request_sock {
 	u32				peer_secid;
 };

+static inline struct request_sock *inet_reqsk(struct sock *sk)
+{
+	return (struct request_sock *)sk;
+}
+
+static inline struct sock *req_to_sk(struct request_sock *req)
+{
+	return (struct sock *)req;
+}
+
 static inline struct request_sock *
 reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener)
 {
@@ -78,6 +88,8 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener)
 		req->rsk_ops = ops;
 		sock_hold(sk_listener);
 		req->rsk_listener = sk_listener;
+		req_to_sk(req)->sk_prot = sk_listener->sk_prot;
+		sk_node_init(&req_to_sk(req)->sk_node);
 		req->saved_syn = NULL;
 		/* Following is temporary. It is coupled with debugging
 		 * helpers in reqsk_put() & reqsk_free()
@@ -87,16 +99,6 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener)
 	return req;
 }

-static inline struct request_sock *inet_reqsk(struct sock *sk)
-{
-	return (struct request_sock *)sk;
-}
-
-static inline struct sock *req_to_sk(struct request_sock *req)
-{
-	return (struct sock *)req;
-}
-
 static inline void reqsk_free(struct request_sock *req)
 {
 	/* temporary debugging */
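Why sk_prot must be valid this early: once request sockets live in the
generic ehash table (next patch), shared helpers recover the table from
the socket itself rather than from protocol-specific code. A compressed C
sketch of that dependency (invented *_sketch types, not a kernel excerpt):

	struct hashinfo_sketch { int buckets; };
	struct proto_sketch { struct hashinfo_sketch *hashinfo; };
	struct sock_sketch { struct proto_sketch *sk_prot; };

	static struct hashinfo_sketch *table_of(struct sock_sketch *req_sk)
	{
		/* dereferenced on every unlink/lookup; a NULL sk_prot
		 * would crash here, hence reqsk_alloc() copies it from
		 * the listener before the req is visible to anyone
		 */
		return req_sk->sk_prot->hashinfo;
	}

sk_node_init() matters for the same reason: the node must look unhashed
before generic insert/delete primitives ever touch it.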
reqsk_queue_len(queue) >> queue->listen_opt->max_qlen_log; } -void reqsk_queue_hash_req(struct request_sock_queue *queue, - u32 hash, struct request_sock *req, - unsigned long timeout); - #endif /* _REQUEST_SOCK_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index a26341d2ad67..225e9561af35 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1618,7 +1618,6 @@ static inline bool tcp_stream_is_thin(struct tcp_sock *tp) /* /proc */ enum tcp_seq_states { TCP_SEQ_STATE_LISTENING, - TCP_SEQ_STATE_OPENREQ, TCP_SEQ_STATE_ESTABLISHED, }; @@ -1717,8 +1716,6 @@ struct tcp_request_sock_ops { int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, u16 queue_mapping, struct tcp_fastopen_cookie *foc); - void (*queue_hash_add)(struct sock *sk, struct request_sock *req, - const unsigned long timeout); }; #ifdef CONFIG_SYN_COOKIES diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 5ca624cea04c..a4b305d8ca2b 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -99,35 +99,9 @@ static inline struct listen_sock *reqsk_queue_yank_listen_sk( void reqsk_queue_destroy(struct request_sock_queue *queue) { - /* make all the listen_opt local to us */ struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue); - if (reqsk_queue_len(queue) != 0) { - unsigned int i; - - for (i = 0; i < lopt->nr_table_entries; i++) { - struct request_sock *req; - - spin_lock_bh(&queue->syn_wait_lock); - while ((req = lopt->syn_table[i]) != NULL) { - lopt->syn_table[i] = req->dl_next; - /* Because of following del_timer_sync(), - * we must release the spinlock here - * or risk a dead lock. - */ - spin_unlock_bh(&queue->syn_wait_lock); - atomic_dec(&queue->qlen); - if (del_timer_sync(&req->rsk_timer)) - reqsk_put(req); - reqsk_put(req); - spin_lock_bh(&queue->syn_wait_lock); - } - spin_unlock_bh(&queue->syn_wait_lock); - } - } - - if (WARN_ON(reqsk_queue_len(queue) != 0)) - pr_err("qlen %u\n", reqsk_queue_len(queue)); + /* cleaning is done by req timers */ kvfree(lopt); } diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 5b7818c63cec..8910c9567719 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -444,36 +444,6 @@ put_and_exit: } EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock); -static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - const struct iphdr *iph = ip_hdr(skb); - struct sock *nsk; - /* Find possible connection requests. 
*/ - struct request_sock *req = inet_csk_search_req(sk, dh->dccph_sport, - iph->saddr, iph->daddr); - if (req) { - nsk = dccp_check_req(sk, skb, req); - if (!nsk) - reqsk_put(req); - return nsk; - } - nsk = inet_lookup_established(sock_net(sk), &dccp_hashinfo, - iph->saddr, dh->dccph_sport, - iph->daddr, dh->dccph_dport, - inet_iif(skb)); - if (nsk != NULL) { - if (nsk->sk_state != DCCP_TIME_WAIT) { - bh_lock_sock(nsk); - return nsk; - } - inet_twsk_put(inet_twsk(nsk)); - return NULL; - } - - return sk; -} - static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, struct sk_buff *skb) { @@ -705,18 +675,6 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) * NOTE: the check for the packet types is done in * dccp_rcv_state_process */ - if (sk->sk_state == DCCP_LISTEN) { - struct sock *nsk = dccp_v4_hnd_req(sk, skb); - - if (nsk == NULL) - goto discard; - - if (nsk != sk) { - if (dccp_child_process(sk, nsk, skb)) - goto reset; - return 0; - } - } if (dccp_rcv_state_process(sk, skb, dh, skb->len)) goto reset; @@ -724,7 +682,6 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) reset: dccp_v4_ctl_send_reset(sk, skb); -discard: kfree_skb(skb); return 0; } @@ -868,6 +825,27 @@ static int dccp_v4_rcv(struct sk_buff *skb) goto no_dccp_socket; } + if (sk->sk_state == DCCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + struct sock *nsk = NULL; + + sk = req->rsk_listener; + if (sk->sk_state == DCCP_LISTEN) + nsk = dccp_check_req(sk, skb, req); + if (!nsk) { + reqsk_put(req); + goto discard_it; + } + if (nsk == sk) { + sock_hold(sk); + reqsk_put(req); + } else if (dccp_child_process(sk, nsk, skb)) { + dccp_v4_ctl_send_reset(sk, skb); + goto discard_it; + } else { + return 0; + } + } /* * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage * o if MinCsCov = 0, only packets with CsCov = 0 are accepted diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index e8753aa3b7a4..1361a3f45df7 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -290,37 +290,6 @@ static struct request_sock_ops dccp6_request_sock_ops = { .syn_ack_timeout = dccp_syn_ack_timeout, }; -static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - const struct ipv6hdr *iph = ipv6_hdr(skb); - struct request_sock *req; - struct sock *nsk; - - req = inet6_csk_search_req(sk, dh->dccph_sport, &iph->saddr, - &iph->daddr, inet6_iif(skb)); - if (req) { - nsk = dccp_check_req(sk, skb, req); - if (!nsk) - reqsk_put(req); - return nsk; - } - nsk = __inet6_lookup_established(sock_net(sk), &dccp_hashinfo, - &iph->saddr, dh->dccph_sport, - &iph->daddr, ntohs(dh->dccph_dport), - inet6_iif(skb)); - if (nsk != NULL) { - if (nsk->sk_state != DCCP_TIME_WAIT) { - bh_lock_sock(nsk); - return nsk; - } - inet_twsk_put(inet_twsk(nsk)); - return NULL; - } - - return sk; -} - static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { struct request_sock *req; @@ -398,7 +367,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (dccp_v6_send_response(sk, req)) goto drop_and_free; - inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); + inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); return 0; drop_and_free: @@ -641,24 +610,6 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) * NOTE: the check for the packet types is done in * dccp_rcv_state_process */ - if (sk->sk_state == DCCP_LISTEN) { - struct sock *nsk = dccp_v6_hnd_req(sk, skb); - - if (nsk == NULL) - goto discard; - /* - * Queue it 
on the new socket if the new socket is active, - * otherwise we just shortcircuit this and continue with - * the new socket.. - */ - if (nsk != sk) { - if (dccp_child_process(sk, nsk, skb)) - goto reset; - if (opt_skb != NULL) - __kfree_skb(opt_skb); - return 0; - } - } if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len)) goto reset; @@ -732,6 +683,27 @@ static int dccp_v6_rcv(struct sk_buff *skb) goto no_dccp_socket; } + if (sk->sk_state == DCCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + struct sock *nsk = NULL; + + sk = req->rsk_listener; + if (sk->sk_state == DCCP_LISTEN) + nsk = dccp_check_req(sk, skb, req); + if (!nsk) { + reqsk_put(req); + goto discard_it; + } + if (nsk == sk) { + sock_hold(sk); + reqsk_put(req); + } else if (dccp_child_process(sk, nsk, skb)) { + dccp_v6_ctl_send_reset(sk, skb); + goto discard_it; + } else { + return 0; + } + } /* * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage * o if MinCsCov = 0, only packets with CsCov = 0 are accepted diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index e62f04775c93..80904df02187 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -476,65 +476,12 @@ no_route: } EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); -static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, - const u32 rnd, const u32 synq_hsize) -{ - return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1); -} - #if IS_ENABLED(CONFIG_IPV6) #define AF_INET_FAMILY(fam) ((fam) == AF_INET) #else #define AF_INET_FAMILY(fam) true #endif -/* Note: this is temporary : - * req sock will no longer be in listener hash table -*/ -struct request_sock *inet_csk_search_req(struct sock *sk, - const __be16 rport, - const __be32 raddr, - const __be32 laddr) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - struct request_sock *req; - u32 hash = inet_synq_hash(raddr, rport, lopt->hash_rnd, - lopt->nr_table_entries); - - spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); - for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) { - const struct inet_request_sock *ireq = inet_rsk(req); - - if (ireq->ir_rmt_port == rport && - ireq->ir_rmt_addr == raddr && - ireq->ir_loc_addr == laddr && - AF_INET_FAMILY(req->rsk_ops->family)) { - atomic_inc(&req->rsk_refcnt); - WARN_ON(req->sk); - break; - } - } - spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); - - return req; -} -EXPORT_SYMBOL_GPL(inet_csk_search_req); - -void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, - unsigned long timeout) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr, - inet_rsk(req)->ir_rmt_port, - lopt->hash_rnd, lopt->nr_table_entries); - - reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); - inet_csk_reqsk_queue_added(sk); -} -EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); - /* Only thing we need from tcp.h */ extern int sysctl_tcp_synack_retries; @@ -571,26 +518,20 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) } EXPORT_SYMBOL(inet_rtx_syn_ack); -/* return true if req was found in the syn_table[] */ +/* return true if req was found in the ehash table */ static bool reqsk_queue_unlink(struct request_sock_queue *queue, struct request_sock *req) { - struct listen_sock *lopt = queue->listen_opt; - struct 
request_sock **prev; - bool found = false; + struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo; + spinlock_t *lock; + bool found; - spin_lock(&queue->syn_wait_lock); + lock = inet_ehash_lockp(hashinfo, req->rsk_hash); - for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL; - prev = &(*prev)->dl_next) { - if (*prev == req) { - *prev = req->dl_next; - found = true; - break; - } - } + spin_lock(lock); + found = __sk_nulls_del_node_init_rcu(req_to_sk(req)); + spin_unlock(lock); - spin_unlock(&queue->syn_wait_lock); if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer)) reqsk_put(req); return found; @@ -616,10 +557,8 @@ static void reqsk_timer_handler(unsigned long data) int max_retries, thresh; u8 defer_accept; - if (sk_listener->sk_state != TCP_LISTEN || !lopt) { - reqsk_put(req); - return; - } + if (sk_listener->sk_state != TCP_LISTEN || !lopt) + goto drop; max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; thresh = max_retries; @@ -669,36 +608,36 @@ static void reqsk_timer_handler(unsigned long data) mod_timer_pinned(&req->rsk_timer, jiffies + timeo); return; } +drop: inet_csk_reqsk_queue_drop(sk_listener, req); reqsk_put(req); } -void reqsk_queue_hash_req(struct request_sock_queue *queue, - u32 hash, struct request_sock *req, - unsigned long timeout) +static void reqsk_queue_hash_req(struct request_sock *req, + unsigned long timeout) { - struct listen_sock *lopt = queue->listen_opt; - req->num_retrans = 0; req->num_timeout = 0; req->sk = NULL; setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req); mod_timer_pinned(&req->rsk_timer, jiffies + timeout); - req->rsk_hash = hash; + inet_ehash_insert(req_to_sk(req), NULL); /* before letting lookups find us, make sure all req fields * are committed to memory and refcnt initialized. 
*/ smp_wmb(); atomic_set(&req->rsk_refcnt, 2); +} - spin_lock(&queue->syn_wait_lock); - req->dl_next = lopt->syn_table[hash]; - lopt->syn_table[hash] = req; - spin_unlock(&queue->syn_wait_lock); +void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, + unsigned long timeout) +{ + reqsk_queue_hash_req(req, timeout); + inet_csk_reqsk_queue_added(sk); } -EXPORT_SYMBOL(reqsk_queue_hash_req); +EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); /** * inet_csk_clone_lock - clone an inet socket, and lock its clone diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 0ac1d68dc8a6..ab9f8a66615d 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -730,91 +730,21 @@ static void twsk_build_assert(void) #endif } -static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - const struct nlattr *bc) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_sock *inet = inet_sk(sk); - struct inet_diag_entry entry; - int j, s_j, reqnum, s_reqnum; - struct listen_sock *lopt; - int err = 0; - - s_j = cb->args[3]; - s_reqnum = cb->args[4]; - - if (s_j > 0) - s_j--; - - entry.family = sk->sk_family; - - spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); - - lopt = icsk->icsk_accept_queue.listen_opt; - if (!lopt || !reqsk_queue_len(&icsk->icsk_accept_queue)) - goto out; - - if (bc) { - entry.sport = inet->inet_num; - entry.userlocks = sk->sk_userlocks; - } - - for (j = s_j; j < lopt->nr_table_entries; j++) { - struct request_sock *req, *head = lopt->syn_table[j]; - - reqnum = 0; - for (req = head; req; reqnum++, req = req->dl_next) { - struct inet_request_sock *ireq = inet_rsk(req); - - if (reqnum < s_reqnum) - continue; - if (r->id.idiag_dport != ireq->ir_rmt_port && - r->id.idiag_dport) - continue; - - if (bc) { - /* Note: entry.sport and entry.userlocks are already set */ - entry_fill_addrs(&entry, req_to_sk(req)); - entry.dport = ntohs(ireq->ir_rmt_port); - - if (!inet_diag_bc_run(bc, &entry)) - continue; - } - - err = inet_req_diag_fill(req_to_sk(req), skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, cb->nlh); - if (err < 0) { - cb->args[3] = j + 1; - cb->args[4] = reqnum; - goto out; - } - } - - s_reqnum = 0; - } - -out: - spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); - - return err; -} - void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, struct nlattr *bc) { struct net *net = sock_net(skb->sk); int i, num, s_i, s_num; + u32 idiag_states = r->idiag_states; + if (idiag_states & TCPF_SYN_RECV) + idiag_states |= TCPF_NEW_SYN_RECV; s_i = cb->args[1]; s_num = num = cb->args[2]; if (cb->args[0] == 0) { - if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV))) + if (!(idiag_states & TCPF_LISTEN)) goto skip_listen_ht; for (i = s_i; i < INET_LHTABLE_SIZE; i++) { @@ -844,21 +774,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, r->id.idiag_sport) goto next_listen; - if (!(r->idiag_states & TCPF_LISTEN) || - r->id.idiag_dport || + if (r->id.idiag_dport || cb->args[3] > 0) - goto syn_recv; - - if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { - spin_unlock_bh(&ilb->lock); - goto done; - } - -syn_recv: - if (!(r->idiag_states & TCPF_SYN_RECV)) goto next_listen; - if (inet_diag_dump_reqs(skb, sk, cb, r, bc) < 0) { + if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { spin_unlock_bh(&ilb->lock); goto done; } @@ -879,7 +799,7 @@ 
skip_listen_ht: s_i = num = s_num = 0; } - if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) + if (!(idiag_states & ~TCPF_LISTEN)) goto out; for (i = s_i; i <= hashinfo->ehash_mask; i++) { @@ -906,7 +826,7 @@ skip_listen_ht: goto next_normal; state = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_substate : sk->sk_state; - if (!(r->idiag_states & (1 << state))) + if (!(idiag_states & (1 << state))) goto next_normal; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 56742e995dd3..bed8886a4b6c 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -398,14 +398,18 @@ static u32 inet_sk_port_offset(const struct sock *sk) inet->inet_dport); } -void __inet_hash_nolisten(struct sock *sk, struct sock *osk) +/* insert a socket into ehash, and eventually remove another one + * (The another one can be a SYN_RECV or TIMEWAIT + */ +int inet_ehash_insert(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct hlist_nulls_head *list; struct inet_ehash_bucket *head; spinlock_t *lock; + int ret = 0; - WARN_ON(!sk_unhashed(sk)); + WARN_ON_ONCE(!sk_unhashed(sk)); sk->sk_hash = sk_ehashfn(sk); head = inet_ehash_bucket(hashinfo, sk->sk_hash); @@ -419,6 +423,12 @@ void __inet_hash_nolisten(struct sock *sk, struct sock *osk) sk_nulls_del_node_init_rcu(osk); } spin_unlock(lock); + return ret; +} + +void __inet_hash_nolisten(struct sock *sk, struct sock *osk) +{ + inet_ehash_insert(sk, osk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } EXPORT_SYMBOL_GPL(__inet_hash_nolisten); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 6b97b5f6457c..729ceb5f63c6 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -284,6 +284,10 @@ bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt, } EXPORT_SYMBOL(cookie_ecn_ok); +/* On input, sk is a listener. + * Output is listener if incoming packet would not create a child + * NULL if memory could not be allocated. 
+ */ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) { struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8b0ce73c2049..a56912772354 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6241,7 +6241,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop_and_free; tcp_rsk(req)->tfo_listener = false; - af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); } tcp_reqsk_record_syn(sk, req, skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a33101616215..bfe9d39ee87d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1224,7 +1224,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .route_req = tcp_v4_route_req, .init_seq = tcp_v4_init_sequence, .send_synack = tcp_v4_send_synack, - .queue_hash_add = inet_csk_reqsk_queue_hash_add, }; int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) @@ -1343,34 +1342,11 @@ put_and_exit: } EXPORT_SYMBOL(tcp_v4_syn_recv_sock); -static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) +static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) { +#ifdef CONFIG_SYN_COOKIES const struct tcphdr *th = tcp_hdr(skb); - const struct iphdr *iph = ip_hdr(skb); - struct request_sock *req; - struct sock *nsk; - - req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr); - if (req) { - nsk = tcp_check_req(sk, skb, req, false); - if (!nsk || nsk == sk) - reqsk_put(req); - return nsk; - } - - nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, - th->source, iph->daddr, th->dest, inet_iif(skb)); - - if (nsk) { - if (nsk->sk_state != TCP_TIME_WAIT) { - bh_lock_sock(nsk); - return nsk; - } - inet_twsk_put(inet_twsk(nsk)); - return NULL; - } -#ifdef CONFIG_SYN_COOKIES if (!th->syn) sk = cookie_v4_check(sk, skb); #endif @@ -1409,10 +1385,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) goto csum_err; if (sk->sk_state == TCP_LISTEN) { - struct sock *nsk = tcp_v4_hnd_req(sk, skb); + struct sock *nsk = tcp_v4_cookie_check(sk, skb); + if (!nsk) goto discard; - if (nsk != sk) { sock_rps_save_rxhash(nsk, skb); sk_mark_napi_id(nsk, skb); @@ -1603,6 +1579,29 @@ process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; + if (sk->sk_state == TCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + struct sock *nsk = NULL; + + sk = req->rsk_listener; + if (tcp_v4_inbound_md5_hash(sk, skb)) + goto discard_and_relse; + if (sk->sk_state == TCP_LISTEN) + nsk = tcp_check_req(sk, skb, req, false); + if (!nsk) { + reqsk_put(req); + goto discard_it; + } + if (nsk == sk) { + sock_hold(sk); + reqsk_put(req); + } else if (tcp_child_process(sk, nsk, skb)) { + tcp_v4_send_reset(nsk, skb); + goto discard_it; + } else { + return 0; + } + } if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); goto discard_and_relse; @@ -1830,35 +1829,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) ++st->num; ++st->offset; - if (st->state == TCP_SEQ_STATE_OPENREQ) { - struct request_sock *req = cur; - - icsk = inet_csk(st->syn_wait_sk); - req = req->dl_next; - while (1) { - while (req) { - if (req->rsk_ops->family == st->family) { - cur = req; - goto out; - } - req = req->dl_next; - } - if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries) - break; -get_req: - req = 
-			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
-		}
-		sk	  = sk_nulls_next(st->syn_wait_sk);
-		st->state = TCP_SEQ_STATE_LISTENING;
-		spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
-	} else {
-		icsk = inet_csk(sk);
-		spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
-		if (reqsk_queue_len(&icsk->icsk_accept_queue))
-			goto start_req;
-		spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
-		sk = sk_nulls_next(sk);
-	}
+	sk = sk_nulls_next(sk);
 get_sk:
 	sk_nulls_for_each_from(sk, node) {
 		if (!net_eq(sock_net(sk), net))
@@ -1868,15 +1839,6 @@ get_sk:
 			goto out;
 		}
 		icsk = inet_csk(sk);
-		spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
-		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
-start_req:
-			st->syn_wait_sk = sk;
-			st->state	= TCP_SEQ_STATE_OPENREQ;
-			st->sbucket	= 0;
-			goto get_req;
-		}
-		spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
 	}
 	spin_unlock_bh(&ilb->lock);
 	st->offset = 0;
@@ -2008,7 +1970,6 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
 	void *rc = NULL;

 	switch (st->state) {
-	case TCP_SEQ_STATE_OPENREQ:
 	case TCP_SEQ_STATE_LISTENING:
 		if (st->bucket >= INET_LHTABLE_SIZE)
 			break;
@@ -2067,7 +2028,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	}

 	switch (st->state) {
-	case TCP_SEQ_STATE_OPENREQ:
 	case TCP_SEQ_STATE_LISTENING:
 		rc = listening_get_next(seq, v);
 		if (!rc) {
@@ -2092,11 +2052,6 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
 	struct tcp_iter_state *st = seq->private;

 	switch (st->state) {
-	case TCP_SEQ_STATE_OPENREQ:
-		if (v) {
-			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
-			spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
-		}
 	case TCP_SEQ_STATE_LISTENING:
 		if (v != SEQ_START_TOKEN)
 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
@@ -2269,18 +2224,12 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)
 	}
 	st = seq->private;

-	switch (st->state) {
-	case TCP_SEQ_STATE_LISTENING:
-	case TCP_SEQ_STATE_ESTABLISHED:
-		if (sk->sk_state == TCP_TIME_WAIT)
-			get_timewait4_sock(v, seq, st->num);
-		else
-			get_tcp4_sock(v, seq, st->num);
-		break;
-	case TCP_SEQ_STATE_OPENREQ:
+	if (sk->sk_state == TCP_TIME_WAIT)
+		get_timewait4_sock(v, seq, st->num);
+	else if (sk->sk_state == TCP_NEW_SYN_RECV)
 		get_openreq4(v, seq, st->num);
-		break;
-	}
+	else
+		get_tcp4_sock(v, seq, st->num);
 out:
 	seq_pad(seq, '\n');
 	return 0;
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index ea915aa5e4e2..5d1c7cee2cb2 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -94,73 +94,6 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk,
 }
 EXPORT_SYMBOL(inet6_csk_route_req);

-/*
- * request_sock (formerly open request) hash tables.
- */
-static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
-			   const u32 rnd, const u32 synq_hsize)
-{
-	u32 c;
-
-	c = jhash_3words((__force u32)raddr->s6_addr32[0],
-			 (__force u32)raddr->s6_addr32[1],
-			 (__force u32)raddr->s6_addr32[2],
-			 rnd);
-
-	c = jhash_2words((__force u32)raddr->s6_addr32[3],
-			 (__force u32)rport,
-			 c);
-
-	return c & (synq_hsize - 1);
-}
-
-struct request_sock *inet6_csk_search_req(struct sock *sk,
-					  const __be16 rport,
-					  const struct in6_addr *raddr,
-					  const struct in6_addr *laddr,
-					  const int iif)
-{
-	struct inet_connection_sock *icsk = inet_csk(sk);
-	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
-	struct request_sock *req;
-	u32 hash = inet6_synq_hash(raddr, rport, lopt->hash_rnd,
-				   lopt->nr_table_entries);
-
-	spin_lock(&icsk->icsk_accept_queue.syn_wait_lock);
-	for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) {
-		const struct inet_request_sock *ireq = inet_rsk(req);
-
-		if (ireq->ir_rmt_port == rport &&
-		    req->rsk_ops->family == AF_INET6 &&
-		    ipv6_addr_equal(&ireq->ir_v6_rmt_addr, raddr) &&
-		    ipv6_addr_equal(&ireq->ir_v6_loc_addr, laddr) &&
-		    (!ireq->ir_iif || ireq->ir_iif == iif)) {
-			atomic_inc(&req->rsk_refcnt);
-			WARN_ON(req->sk != NULL);
-			break;
-		}
-	}
-	spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock);
-
-	return req;
-}
-EXPORT_SYMBOL_GPL(inet6_csk_search_req);
-
-void inet6_csk_reqsk_queue_hash_add(struct sock *sk,
-				    struct request_sock *req,
-				    const unsigned long timeout)
-{
-	struct inet_connection_sock *icsk = inet_csk(sk);
-	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
-	const u32 h = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr,
-				      inet_rsk(req)->ir_rmt_port,
-				      lopt->hash_rnd, lopt->nr_table_entries);
-
-	reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
-	inet_csk_reqsk_queue_added(sk);
-}
-EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add);
-
 void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
 {
 	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index cadb44a2d34e..a215614cfb2b 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -727,7 +727,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
 	.route_req	=	tcp_v6_route_req,
 	.init_seq	=	tcp_v6_init_sequence,
 	.send_synack	=	tcp_v6_send_synack,
-	.queue_hash_add =	inet6_csk_reqsk_queue_hash_add,
 };

 static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq,
@@ -938,37 +937,11 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 }

-static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
+static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb)
 {
+#ifdef CONFIG_SYN_COOKIES
 	const struct tcphdr *th = tcp_hdr(skb);
-	struct request_sock *req;
-	struct sock *nsk;
-
-	/* Find possible connection requests.
- */
-	req = inet6_csk_search_req(sk, th->source,
-				   &ipv6_hdr(skb)->saddr,
-				   &ipv6_hdr(skb)->daddr, tcp_v6_iif(skb));
-	if (req) {
-		nsk = tcp_check_req(sk, skb, req, false);
-		if (!nsk || nsk == sk)
-			reqsk_put(req);
-		return nsk;
-	}
-	nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo,
-					 &ipv6_hdr(skb)->saddr, th->source,
-					 &ipv6_hdr(skb)->daddr, ntohs(th->dest),
-					 tcp_v6_iif(skb));
-
-	if (nsk) {
-		if (nsk->sk_state != TCP_TIME_WAIT) {
-			bh_lock_sock(nsk);
-			return nsk;
-		}
-		inet_twsk_put(inet_twsk(nsk));
-		return NULL;
-	}
-#ifdef CONFIG_SYN_COOKIES
 	if (!th->syn)
 		sk = cookie_v6_check(sk, skb);
 #endif
@@ -1258,15 +1231,11 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 		goto csum_err;

 	if (sk->sk_state == TCP_LISTEN) {
-		struct sock *nsk = tcp_v6_hnd_req(sk, skb);
+		struct sock *nsk = tcp_v6_cookie_check(sk, skb);
+
 		if (!nsk)
 			goto discard;
-
-		/*
-		 * Queue it on the new socket if the new socket is active,
-		 * otherwise we just shortcircuit this and continue with
-		 * the new socket..
-		 */
 		if (nsk != sk) {
 			sock_rps_save_rxhash(nsk, skb);
 			sk_mark_napi_id(nsk, skb);
@@ -1402,6 +1371,33 @@ process:
 	if (sk->sk_state == TCP_TIME_WAIT)
 		goto do_time_wait;

+	if (sk->sk_state == TCP_NEW_SYN_RECV) {
+		struct request_sock *req = inet_reqsk(sk);
+		struct sock *nsk = NULL;
+
+		sk = req->rsk_listener;
+		tcp_v6_fill_cb(skb, hdr, th);
+		if (tcp_v6_inbound_md5_hash(sk, skb)) {
+			reqsk_put(req);
+			goto discard_it;
+		}
+		if (sk->sk_state == TCP_LISTEN)
+			nsk = tcp_check_req(sk, skb, req, false);
+		if (!nsk) {
+			reqsk_put(req);
+			goto discard_it;
+		}
+		if (nsk == sk) {
+			sock_hold(sk);
+			reqsk_put(req);
+			tcp_v6_restore_cb(skb);
+		} else if (tcp_child_process(sk, nsk, skb)) {
+			tcp_v6_send_reset(nsk, skb);
+			goto discard_it;
+		} else {
+			return 0;
+		}
+	}
 	if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) {
 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
 		goto discard_and_relse;
@@ -1765,18 +1761,12 @@ static int tcp6_seq_show(struct seq_file *seq, void *v)
 	}
 	st = seq->private;

-	switch (st->state) {
-	case TCP_SEQ_STATE_LISTENING:
-	case TCP_SEQ_STATE_ESTABLISHED:
-		if (sk->sk_state == TCP_TIME_WAIT)
-			get_timewait6_sock(seq, v, st->num);
-		else
-			get_tcp6_sock(seq, v, st->num);
-		break;
-	case TCP_SEQ_STATE_OPENREQ:
+	if (sk->sk_state == TCP_TIME_WAIT)
+		get_timewait6_sock(seq, v, st->num);
+	else if (sk->sk_state == TCP_NEW_SYN_RECV)
 		get_openreq6(seq, v, st->num);
-		break;
-	}
+	else
+		get_tcp6_sock(seq, v, st->num);
 out:
 	return 0;
 }
-- cgit v1.2.3
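The demux change above is the heart of this series: a packet can now find a request socket directly in ehash, and processing then continues on its listener. A compact userspace model of that flow may help when reading the hunks; the enum and structs below are illustrative stand-ins, not the kernel's definitions.

#include <stdio.h>

enum state { TCP_LISTEN, TCP_NEW_SYN_RECV, TCP_ESTABLISHED };

struct sock { enum state sk_state; };
struct request_sock { struct sock common; struct sock *rsk_listener; };

/* Model of the TCP_NEW_SYN_RECV branch added to tcp_v4_rcv()/tcp_v6_rcv():
 * when the ehash lookup yields a request socket, switch to its listener,
 * and only if the listener is still in TCP_LISTEN can the request be
 * matched against the packet (tcp_check_req() in the real code).
 */
static int demux(struct sock *sk)
{
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = (struct request_sock *)sk;

		sk = req->rsk_listener;
		if (sk->sk_state != TCP_LISTEN)
			return -1;	/* reqsk_put() + drop in the kernel */
		/* tcp_check_req() may promote req to a child here */
	}
	return 0;
}

int main(void)
{
	struct sock listener = { TCP_LISTEN };
	struct request_sock req = { { TCP_NEW_SYN_RECV }, &listener };

	printf("demux -> %d\n", demux(&req.common));
	return 0;
}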
From 81b496b31a4331415b6a644b485a329ec0b45155 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 2 Oct 2015 11:43:33 -0700
Subject: tcp/dccp: shrink struct listen_sock

We no longer use hash_rnd, nr_table_entries and syn_table[].

For a listener with a backlog of 10 million sockets, this saves
80 MBytes of vmalloced memory.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/request_sock.h |  3 ---
 net/core/request_sock.c    | 14 +++-----------
 2 files changed, 3 insertions(+), 14 deletions(-)

(limited to 'include/net/request_sock.h')

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index e1850923c4f5..353cb61bb399 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -125,9 +125,6 @@ extern int sysctl_max_syn_backlog;
  */
 struct listen_sock {
 	u32			max_qlen_log;
-	u32			hash_rnd;
-	u32			nr_table_entries;
-	struct request_sock	*syn_table[0];
 };

 /*
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index a4b305d8ca2b..124f61c5bfef 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -46,18 +46,11 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
 	nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
 	nr_table_entries = max_t(u32, nr_table_entries, 8);
 	nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
-	lopt_size += nr_table_entries * sizeof(struct request_sock *);

-	if (lopt_size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
-		lopt = kzalloc(lopt_size, GFP_KERNEL |
-					  __GFP_NOWARN |
-					  __GFP_NORETRY);
-	if (!lopt)
-		lopt = vzalloc(lopt_size);
+	lopt = kzalloc(lopt_size, GFP_KERNEL);
 	if (!lopt)
 		return -ENOMEM;

-	get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));

 	spin_lock_init(&queue->rskq_lock);
 	spin_lock_init(&queue->syn_wait_lock);
@@ -68,7 +61,6 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
 	queue->fastopenq.max_qlen = 0;

 	queue->rskq_accept_head = NULL;
-	lopt->nr_table_entries = nr_table_entries;
 	lopt->max_qlen_log = ilog2(nr_table_entries);

 	spin_lock_bh(&queue->syn_wait_lock);
@@ -81,7 +73,7 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
 void __reqsk_queue_destroy(struct request_sock_queue *queue)
 {
 	/* This is an error recovery path only, no locking needed */
-	kvfree(queue->listen_opt);
+	kfree(queue->listen_opt);
 }

 static inline struct listen_sock *reqsk_queue_yank_listen_sk(
@@ -102,7 +94,7 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
 	struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);

 	/* cleaning is done by req timers */
-	kvfree(lopt);
+	kfree(lopt);
 }

 /*
-- cgit v1.2.3
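A quick back-of-the-envelope check of the memory figure above (a sketch: the real table size also depended on roundup_pow_of_two() rounding and the sysctl clamp, so treat the 10-million count as the nominal backlog):

#include <stdio.h>

int main(void)
{
	/* syn_table[] held one pointer per pending-request slot:
	 * ~10 million slots * 8 bytes per pointer on a 64-bit kernel.
	 */
	unsigned long entries = 10UL * 1000 * 1000;
	unsigned long bytes = entries * sizeof(void *);

	printf("%lu bytes (~80 MB)\n", bytes);	/* 80000000 */
	return 0;
}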
From 10cbc8f179177c1a6d5f56a46ebddc8f602ce5ac Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 2 Oct 2015 11:43:36 -0700
Subject: tcp/dccp: remove struct listen_sock

It is enough to check the listener sk_state; no need for an extra
condition.

max_qlen_log can be moved into struct request_sock_queue.

We can remove syn_wait_lock and the alignment it enforced.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/request_sock.h      | 26 ++++-------------------
 net/core/request_sock.c         | 47 +++--------------------------------------
 net/ipv4/inet_connection_sock.c | 14 ++++--------
 3 files changed, 11 insertions(+), 76 deletions(-)

(limited to 'include/net/request_sock.h')

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 353cb61bb399..a66ab1345373 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -119,14 +119,6 @@ static inline void reqsk_put(struct request_sock *req)

 extern int sysctl_max_syn_backlog;

-/** struct listen_sock - listen state
- *
- * @max_qlen_log - log_2 of maximal queued SYNs/REQUESTs
- */
-struct listen_sock {
-	u32			max_qlen_log;
-};
-
 /*
  * For a TCP Fast Open listener -
  *	lock - protects the access to all the reqsk, which is co-owned by
@@ -160,36 +152,26 @@ struct fastopen_queue {
  * @rskq_accept_head - FIFO head of established children
  * @rskq_accept_tail - FIFO tail of established children
  * @rskq_defer_accept - User waits for some data after accept()
- * @syn_wait_lock - serializer
- *
- * %syn_wait_lock is necessary only to avoid proc interface having to grab the main
- * lock sock while browsing the listening hash (otherwise it's deadlock prone).
  *
  */
 struct request_sock_queue {
 	spinlock_t		rskq_lock;
 	u8			rskq_defer_accept;
+	u8			max_qlen_log;
 	u32			synflood_warned;
-
 	atomic_t		qlen;
 	atomic_t		young;

 	struct request_sock	*rskq_accept_head;
 	struct request_sock	*rskq_accept_tail;
-	struct listen_sock	*listen_opt;
 	struct fastopen_queue	fastopenq;  /* Check max_qlen != 0 to determine
 					     * if TFO is enabled.
 					     */
-
-	/* temporary alignment, our goal is to get rid of this lock */
-	spinlock_t		syn_wait_lock ____cacheline_aligned_in_smp;
 };

-int reqsk_queue_alloc(struct request_sock_queue *queue,
-		      unsigned int nr_table_entries);
+void reqsk_queue_alloc(struct request_sock_queue *queue,
+		       unsigned int nr_table_entries);

-void __reqsk_queue_destroy(struct request_sock_queue *queue);
-void reqsk_queue_destroy(struct request_sock_queue *queue);
 void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
 			   bool reset);

@@ -260,7 +242,7 @@ static inline int reqsk_queue_len_young(const struct request_sock_queue *queue)

 static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
 {
-	return reqsk_queue_len(queue) >> queue->listen_opt->max_qlen_log;
+	return reqsk_queue_len(queue) >> queue->max_qlen_log;
 }

 #endif /* _REQUEST_SOCK_H */
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 124f61c5bfef..ecf74189bd3f 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -37,22 +37,14 @@
 int sysctl_max_syn_backlog = 256;
 EXPORT_SYMBOL(sysctl_max_syn_backlog);

-int reqsk_queue_alloc(struct request_sock_queue *queue,
-		      unsigned int nr_table_entries)
+void reqsk_queue_alloc(struct request_sock_queue *queue,
+		       unsigned int nr_table_entries)
 {
-	size_t lopt_size = sizeof(struct listen_sock);
-	struct listen_sock *lopt = NULL;
-
 	nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
 	nr_table_entries = max_t(u32, nr_table_entries, 8);
 	nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
-	lopt = kzalloc(lopt_size, GFP_KERNEL);
-	if (!lopt)
-		return -ENOMEM;

 	spin_lock_init(&queue->rskq_lock);
-	spin_lock_init(&queue->syn_wait_lock);

 	spin_lock_init(&queue->fastopenq.lock);
 	queue->fastopenq.rskq_rst_head = NULL;
@@ -61,40 +53,7 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
 	queue->fastopenq.max_qlen = 0;

 	queue->rskq_accept_head = NULL;
-	lopt->max_qlen_log = ilog2(nr_table_entries);
-
-	spin_lock_bh(&queue->syn_wait_lock);
-	queue->listen_opt = lopt;
-	spin_unlock_bh(&queue->syn_wait_lock);
-
-	return 0;
-}
-
-void __reqsk_queue_destroy(struct request_sock_queue *queue)
-{
-	/* This is an error recovery path only, no locking needed */
-	kfree(queue->listen_opt);
-}
-
-static inline struct listen_sock *reqsk_queue_yank_listen_sk(
-	struct request_sock_queue *queue)
-{
-	struct listen_sock *lopt;
-
-	spin_lock_bh(&queue->syn_wait_lock);
-	lopt = queue->listen_opt;
-	queue->listen_opt = NULL;
-	spin_unlock_bh(&queue->syn_wait_lock);
-
-	return lopt;
-}
-
-void reqsk_queue_destroy(struct request_sock_queue *queue)
-{
-	struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
-
-	/* cleaning is done by req timers */
-	kfree(lopt);
+	queue->max_qlen_log = ilog2(nr_table_entries);
 }

 /*
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 099e0ea9242a..775483283fa7 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -552,12 +552,11 @@ static void reqsk_timer_handler(unsigned long data)
 	struct sock *sk_listener = req->rsk_listener;
 	struct inet_connection_sock *icsk = inet_csk(sk_listener);
 	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
-	struct listen_sock *lopt = queue->listen_opt;
 	int qlen, expire = 0, resend = 0;
 	int max_retries, thresh;
 	u8 defer_accept;

-	if (sk_listener->sk_state != TCP_LISTEN || !lopt)
+	if (sk_listener->sk_state != TCP_LISTEN)
 		goto drop;

 	max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
@@ -580,7 +579,7 @@ static void reqsk_timer_handler(unsigned long data)
 	 * ones are about to clog our table.
 	 */
 	qlen = reqsk_queue_len(queue);
-	if (qlen >> (lopt->max_qlen_log - 1)) {
+	if (qlen >> (queue->max_qlen_log - 1)) {
 		int young = reqsk_queue_len_young(queue) << 1;

 		while (thresh > 2) {
@@ -730,12 +729,10 @@ EXPORT_SYMBOL(inet_csk_prepare_forced_close);

 int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
 {
-	struct inet_sock *inet = inet_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
-	int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
+	struct inet_sock *inet = inet_sk(sk);

-	if (rc != 0)
-		return rc;
+	reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);

 	sk->sk_max_ack_backlog = 0;
 	sk->sk_ack_backlog = 0;
@@ -757,7 +754,6 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
 	}

 	sk->sk_state = TCP_CLOSE;
-	__reqsk_queue_destroy(&icsk->icsk_accept_queue);
 	return -EADDRINUSE;
 }
 EXPORT_SYMBOL_GPL(inet_csk_listen_start);
@@ -780,8 +776,6 @@ void inet_csk_listen_stop(struct sock *sk)
 	 * To be honest, we are not able to make either
 	 * of the variants now.			--ANK
 	 */
-	reqsk_queue_destroy(queue);
-
 	while ((req = reqsk_queue_remove(queue, sk)) != NULL) {
 		struct sock *child = req->sk;
-- cgit v1.2.3
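To make the shift-based fullness test concrete, here is a worked example with a hypothetical table of 16 entries (so max_qlen_log = ilog2(16) = 4); the queue reads as full exactly when qlen >> max_qlen_log becomes non-zero:

#include <stdio.h>

int main(void)
{
	unsigned int max_qlen_log = 4;	/* hypothetical: ilog2(16) */
	unsigned int qlen;

	for (qlen = 14; qlen <= 17; qlen++)
		printf("qlen=%u -> is_full=%u\n", qlen, qlen >> max_qlen_log);
	/* prints 0, 0, 1, 1: the queue is "full" from qlen == 16 onward */
	return 0;
}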
From ef547f2ac16bd9d77a780a0e7c70857e69e8f23f Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 2 Oct 2015 11:43:37 -0700
Subject: tcp: remove max_qlen_log

This control variable was set at the first listen(fd, backlog) call,
but never updated if the application later tried to increase or
decrease the backlog. It made sense at the time the listener had a
non-resizable hash table.

Also, rounding to powers of two was not very friendly.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/inet_connection_sock.h |  2 +-
 include/net/request_sock.h         | 10 ++--------
 net/core/request_sock.c            |  8 +-------
 net/ipv4/inet_connection_sock.c    |  4 ++--
 4 files changed, 6 insertions(+), 18 deletions(-)

(limited to 'include/net/request_sock.h')

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 730aa034cd3d..3208a65d1c28 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -295,7 +295,7 @@ static inline int inet_csk_reqsk_queue_young(const struct sock *sk)

 static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
 {
-	return reqsk_queue_is_full(&inet_csk(sk)->icsk_accept_queue);
+	return inet_csk_reqsk_queue_len(sk) >= sk->sk_max_ack_backlog;
 }

 void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req);
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index a66ab1345373..bae6936d75c4 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -157,7 +157,7 @@ struct fastopen_queue {
 struct request_sock_queue {
 	spinlock_t		rskq_lock;
 	u8			rskq_defer_accept;
-	u8			max_qlen_log;
+
 	u32			synflood_warned;
 	atomic_t		qlen;
@@ -169,8 +169,7 @@ struct request_sock_queue {
 					     */
 };

-void reqsk_queue_alloc(struct request_sock_queue *queue,
-		       unsigned int nr_table_entries);
+void reqsk_queue_alloc(struct request_sock_queue *queue);

 void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
 			   bool reset);
@@ -240,9 +239,4 @@ static inline int reqsk_queue_len_young(const struct request_sock_queue *queue)
 	return atomic_read(&queue->young);
 }

-static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
-{
-	return reqsk_queue_len(queue) >> queue->max_qlen_log;
-}
-
 #endif /* _REQUEST_SOCK_H */
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index ecf74189bd3f..15c853806518 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -37,13 +37,8 @@
 int sysctl_max_syn_backlog = 256;
 EXPORT_SYMBOL(sysctl_max_syn_backlog);

-void reqsk_queue_alloc(struct request_sock_queue *queue,
-		       unsigned int nr_table_entries)
+void reqsk_queue_alloc(struct request_sock_queue *queue)
 {
-	nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
-	nr_table_entries = max_t(u32, nr_table_entries, 8);
-	nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
-
 	spin_lock_init(&queue->rskq_lock);

 	spin_lock_init(&queue->fastopenq.lock);
@@ -53,7 +48,6 @@ void reqsk_queue_alloc(struct request_sock_queue *queue,
 	queue->fastopenq.max_qlen = 0;

 	queue->rskq_accept_head = NULL;
-	queue->max_qlen_log = ilog2(nr_table_entries);
 }

 /*
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 775483283fa7..5f6e31a4aeae 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -579,7 +579,7 @@ static void reqsk_timer_handler(unsigned long data)
 	 * ones are about to clog our table.
 	 */
 	qlen = reqsk_queue_len(queue);
-	if (qlen >> (queue->max_qlen_log - 1)) {
+	if ((qlen << 1) > sk_listener->sk_max_ack_backlog) {
 		int young = reqsk_queue_len_young(queue) << 1;

 		while (thresh > 2) {
@@ -732,7 +732,7 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct inet_sock *inet = inet_sk(sk);

-	reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
+	reqsk_queue_alloc(&icsk->icsk_accept_queue);

 	sk->sk_max_ack_backlog = 0;
 	sk->sk_ack_backlog = 0;
-- cgit v1.2.3
From e96f78ab2703f3b0d512f6b469bc685d2ef20475 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sat, 3 Oct 2015 06:27:28 -0700
Subject: tcp/dccp: add SLAB_DESTROY_BY_RCU flag for request sockets

Before letting request sockets be put into the regular TCP/DCCP ehash
table, we need to either:

- add the SLAB_DESTROY_BY_RCU flag to their kmem_cache, or
- add an RCU grace period before freeing them.

Since we already carefully respect the SLAB_DESTROY_BY_RCU protocol for
ESTABLISHED and TIMEWAIT sockets, use it here as well.

req_prot_init() being used only by TCP and DCCP, I did not add a new
slab_flags field to their rsk_prot, but reuse prot->slab_flags.

Since all reqsk_alloc() callers correctly deal with a failure, add the
__GFP_NOWARN flag to avoid allocation-failure traces under memory
pressure.

Fixes: 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table")
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/request_sock.h | 4 +++-
 net/core/sock.c            | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/net/request_sock.h')

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index bae6936d75c4..dd423d840852 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -82,7 +82,9 @@ static inline struct sock *req_to_sk(struct request_sock *req)
 static inline struct request_sock *
 reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener)
 {
-	struct request_sock *req = kmem_cache_alloc(ops->slab, GFP_ATOMIC);
+	struct request_sock *req;
+
+	req = kmem_cache_alloc(ops->slab, GFP_ATOMIC | __GFP_NOWARN);

 	if (req) {
 		req->rsk_ops = ops;
diff --git a/net/core/sock.c b/net/core/sock.c
index 3307c02244d3..7dd1263e4c24 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2758,7 +2758,7 @@ static int req_prot_init(const struct proto *prot)

 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
 					   rsk_prot->obj_size, 0,
-					   0, NULL);
+					   prot->slab_flags, NULL);

 	if (!rsk_prot->slab) {
 		pr_crit("%s: Can't create request sock SLAB cache!\n",
-- cgit v1.2.3
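The SLAB_DESTROY_BY_RCU protocol the message refers to is worth sketching: objects freed from such a cache may be immediately reused as another object of the same type, so a lockless reader must take a reference and then re-validate the identity keys. A userspace-flavoured outline of that pattern, with stub types (this shows the discipline, not kernel code):

#include <stdbool.h>
#include <stdio.h>

struct sock { int refcnt; unsigned int key; };

static bool get_ref(struct sock *sk)
{
	if (sk->refcnt == 0)		/* atomic_inc_not_zero() in the kernel */
		return false;
	sk->refcnt++;
	return true;
}

/* The slot may have been freed and reused for another connection, so the
 * key must be checked again after the reference has been taken.
 */
static struct sock *lookup(struct sock *sk, unsigned int key)
{
	if (sk->key != key)
		return NULL;
	if (!get_ref(sk))
		return NULL;
	if (sk->key != key) {		/* reused under us: bail out */
		sk->refcnt--;		/* sock_put() */
		return NULL;
	}
	return sk;
}

int main(void)
{
	struct sock sk = { .refcnt = 1, .key = 42 };

	printf("found=%d\n", lookup(&sk, 42) != NULL);
	return 0;
}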
From 004a5d0140ce1d05c1f5fce5df4baa2717a330e0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sun, 4 Oct 2015 21:08:10 -0700
Subject: net: use sk_fullsock() in __netdev_pick_tx()

SYN_RECV and TIMEWAIT sockets are not full-blown; they do not have a
sk_dst_cache pointer.

Fixes: ca6fb0651883 ("tcp: attach SYNACK messages to request sockets instead of listener")
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/request_sock.h | 1 +
 net/core/dev.c             | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/net/request_sock.h')

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index dd423d840852..f83669460d82 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -92,6 +92,7 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener)
 		req->rsk_listener = sk_listener;
 		req_to_sk(req)->sk_prot = sk_listener->sk_prot;
 		sk_node_init(&req_to_sk(req)->sk_node);
+		sk_tx_queue_clear(req_to_sk(req));
 		req->saved_syn = NULL;
 		/* Following is temporary. It is coupled with debugging
 		 * helpers in reqsk_put() & reqsk_free()
diff --git a/net/core/dev.c b/net/core/dev.c
index 323c04edd779..a229bf0d649d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2974,6 +2974,7 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
 			new_index = skb_tx_hash(dev, skb);

 		if (queue_index != new_index && sk &&
+		    sk_fullsock(sk) &&
 		    rcu_access_pointer(sk->sk_dst_cache))
 			sk_tx_queue_set(sk, new_index);
-- cgit v1.2.3
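For context, sk_fullsock() distinguishes full sockets from the two mini-socket flavours by a state mask; in kernels of this era it is essentially the following (quoted from memory, so treat it as a sketch rather than the authoritative definition):

static inline bool sk_fullsock(const struct sock *sk)
{
	/* true for every state except TIME_WAIT and NEW_SYN_RECV */
	return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV);
}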
From a1a5344ddbe8fd3e080013b317ac9a664490cfdf Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sun, 4 Oct 2015 21:08:11 -0700
Subject: tcp: avoid two atomic ops for syncookies

inet_reqsk_alloc() is used to allocate a temporary request in order to
generate a SYNACK with a cookie. Then later, syncookie validation also
uses a temporary request.

These paths already hold a reference on the listener refcount, so we
can avoid a couple of atomic operations.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/inet_sock.h    |  3 ++-
 include/net/request_sock.h | 11 ++++++++---
 net/dccp/ipv4.c            |  2 +-
 net/dccp/ipv6.c            |  2 +-
 net/ipv4/syncookies.c      |  2 +-
 net/ipv4/tcp_input.c       |  8 +++++---
 net/ipv6/syncookies.c      |  2 +-
 7 files changed, 19 insertions(+), 11 deletions(-)

(limited to 'include/net/request_sock.h')

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 47eb67b08abd..f5bf7310e334 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -245,7 +245,8 @@ static inline unsigned int __inet_ehashfn(const __be32 laddr,
 }

 struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
-				      struct sock *sk_listener);
+				      struct sock *sk_listener,
+				      bool attach_listener);

 static inline __u8 inet_sk_flowi_flags(const struct sock *sk)
 {
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index f83669460d82..95ab5d7aab96 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -80,7 +80,8 @@ static inline struct sock *req_to_sk(struct request_sock *req)
 }

 static inline struct request_sock *
-reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener)
+reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener,
+	    bool attach_listener)
 {
 	struct request_sock *req;

@@ -88,8 +89,12 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener)

 	if (req) {
 		req->rsk_ops = ops;
-		sock_hold(sk_listener);
-		req->rsk_listener = sk_listener;
+		if (attach_listener) {
+			sock_hold(sk_listener);
+			req->rsk_listener = sk_listener;
+		} else {
+			req->rsk_listener = NULL;
+		}
 		req_to_sk(req)->sk_prot = sk_listener->sk_prot;
 		sk_node_init(&req_to_sk(req)->sk_node);
 		sk_tx_queue_clear(req_to_sk(req));
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 8910c9567719..8e99681c8189 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -595,7 +595,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
 		goto drop;

-	req = inet_reqsk_alloc(&dccp_request_sock_ops, sk);
+	req = inet_reqsk_alloc(&dccp_request_sock_ops, sk, true);
 	if (req == NULL)
 		goto drop;
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 1361a3f45df7..aed314f8c7c6 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -319,7 +319,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
 		goto drop;

-	req = inet_reqsk_alloc(&dccp6_request_sock_ops, sk);
+	req = inet_reqsk_alloc(&dccp6_request_sock_ops, sk, true);
 	if (req == NULL)
 		goto drop;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 729ceb5f63c6..8113c30ccf96 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -326,7 +326,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 		goto out;

 	ret = NULL;
-	req = inet_reqsk_alloc(&tcp_request_sock_ops, sk); /* for safety */
+	req = inet_reqsk_alloc(&tcp_request_sock_ops, sk, false); /* for safety */
 	if (!req)
 		goto out;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a95c8eb04ff7..ddadb318e850 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6042,9 +6042,11 @@ static void tcp_openreq_init(struct request_sock *req,
 }

 struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
-				      struct sock *sk_listener)
+				      struct sock *sk_listener,
+				      bool attach_listener)
 {
-	struct request_sock *req = reqsk_alloc(ops, sk_listener);
+	struct request_sock *req = reqsk_alloc(ops, sk_listener,
+					       attach_listener);

 	if (req) {
 		struct inet_request_sock *ireq = inet_rsk(req);
@@ -6143,7 +6145,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		goto drop;
 	}

-	req = inet_reqsk_alloc(rsk_ops, sk);
+	req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
 	if (!req)
 		goto drop;
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 7606eba83e7b..f610b5310b17 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -170,7 +170,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 		goto out;

 	ret = NULL;
-	req = inet_reqsk_alloc(&tcp6_request_sock_ops, sk);
+	req = inet_reqsk_alloc(&tcp6_request_sock_ops, sk, false);
 	if (!req)
 		goto out;
-- cgit v1.2.3
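The asymmetry is visible in the two call styles taken from the diff above: the regular SYN path pins the listener because the request outlives the call, while the syncookie paths pass attach_listener == false since the caller already holds a listener reference for the lifetime of the temporary request.

/* Regular (possibly Fast Open) SYN handling: the request may sit in the
 * pending queue for seconds, so it must take its own listener reference.
 */
req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);

/* Syncookie validation: the request is short-lived and the caller keeps
 * the listener alive; skipping sock_hold() saves an atomic increment and
 * the matching decrement in reqsk_put()/reqsk_free().
 */
req = inet_reqsk_alloc(&tcp_request_sock_ops, sk, false);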
From 8e5eb54d303b7cb1174977ca79030e135728c95e Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 8 Oct 2015 19:33:22 -0700
Subject: net: align sk_refcnt on 128 bytes boundary

sk->sk_refcnt is dirtied for every incoming TCP/UDP packet. This is a
performance issue if multiple cpus hit a common socket, or multiple
sockets are chained due to SO_REUSEPORT.

By moving sk_refcnt 8 bytes further, the first 128 bytes of sockets are
mostly read. As they contain the lookup keys, this has a considerable
performance impact, as cpus can cache them.

These 8 bytes are not wasted; we use them as a placeholder for various
fields, depending on the socket type.

Tested: SYN flood hitting a 16-RX-queue NIC. TCP listener using 16
sockets with SO_REUSEPORT and SO_INCOMING_CPU for proper siloing.

Could process 6.0 Mpps of SYNs instead of 4.2 Mpps.

Kernel profile looked like:
    11.68%  [kernel]  [k] sha_transform
     6.51%  [kernel]  [k] __inet_lookup_listener
     5.07%  [kernel]  [k] __inet_lookup_established
     4.15%  [kernel]  [k] memcpy_erms
     3.46%  [kernel]  [k] ipt_do_table
     2.74%  [kernel]  [k] fib_table_lookup
     2.54%  [kernel]  [k] tcp_make_synack
     2.34%  [kernel]  [k] tcp_conn_request
     2.05%  [kernel]  [k] __netif_receive_skb_core
     2.03%  [kernel]  [k] kmem_cache_alloc

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/inet_timewait_sock.h |  2 +-
 include/net/request_sock.h       |  2 +-
 include/net/sock.h               | 17 ++++++++++++++---
 3 files changed, 16 insertions(+), 5 deletions(-)

(limited to 'include/net/request_sock.h')

diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 186f3a1e1b1f..e581fc69129d 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -70,6 +70,7 @@ struct inet_timewait_sock {
 #define tw_dport		__tw_common.skc_dport
 #define tw_num			__tw_common.skc_num
 #define tw_cookie		__tw_common.skc_cookie
+#define tw_dr			__tw_common.skc_tw_dr

 	int			tw_timeout;
 	volatile unsigned char	tw_substate;
@@ -88,7 +89,6 @@ struct inet_timewait_sock {
 	kmemcheck_bitfield_end(flags);
 	struct timer_list	tw_timer;
 	struct inet_bind_bucket	*tw_tb;
-	struct inet_timewait_death_row *tw_dr;
 };
 #define tw_tclass tw_tos

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 95ab5d7aab96..6b818b77d5e5 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -50,9 +50,9 @@ struct request_sock {
 	struct sock_common		__req_common;
 #define rsk_refcnt			__req_common.skc_refcnt
 #define rsk_hash			__req_common.skc_hash
+#define rsk_listener			__req_common.skc_listener

 	struct request_sock		*dl_next;
-	struct sock			*rsk_listener;
 	u16				mss;
 	u8				num_retrans; /* number of retransmits */
 	u8				cookie_ts:1; /* syncookie: encode tcpopts in timestamp */
diff --git a/include/net/sock.h b/include/net/sock.h
index cf54739f30d5..65712409464b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -150,6 +150,9 @@ typedef __u64 __bitwise __addrpair;
  *	@skc_node: main hash linkage for various protocol lookup tables
  *	@skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
  *	@skc_tx_queue_mapping: tx queue number for this connection
+ *	@skc_flags: place holder for sk_flags
+ *		%SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
+ *		%SO_OOBINLINE settings, %SO_TIMESTAMPING settings
  *	@skc_incoming_cpu: record/match cpu processing incoming packets
  *	@skc_refcnt: reference count
  *
@@ -201,6 +204,16 @@ struct sock_common {

 	atomic64_t		skc_cookie;

+	/* following fields are padding to force
+	 * offset(struct sock, sk_refcnt) == 128 on 64bit arches
+	 * assuming IPV6 is enabled. We use this padding differently
+	 * for different kind of 'sockets'
+	 */
+	union {
+		unsigned long	skc_flags;
+		struct sock	*skc_listener; /* request_sock */
+		struct inet_timewait_death_row *skc_tw_dr; /* inet_timewait_sock */
+	};
 	/*
 	 * fields between dontcopy_begin/dontcopy_end
 	 * are not copied in sock_copy()
@@ -246,8 +259,6 @@ struct cg_proto;
  *	@sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
  *	@sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE)
  *	@sk_sndbuf: size of send buffer in bytes
- *	@sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
- *		   %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
  *	@sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets
  *	@sk_no_check_rx: allow zero checksum in RX packets
  *	@sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
@@ -334,6 +345,7 @@ struct sock {
 #define sk_v6_rcv_saddr	__sk_common.skc_v6_rcv_saddr
 #define sk_cookie		__sk_common.skc_cookie
 #define sk_incoming_cpu		__sk_common.skc_incoming_cpu
+#define sk_flags		__sk_common.skc_flags

 	socket_lock_t		sk_lock;
 	struct sk_buff_head	sk_receive_queue;
@@ -371,7 +383,6 @@ struct sock {
 #ifdef CONFIG_XFRM
 	struct xfrm_policy	*sk_policy[2];
 #endif
-	unsigned long 		sk_flags;
 	struct dst_entry	*sk_rx_dst;
 	struct dst_entry __rcu	*sk_dst_cache;
 	spinlock_t		sk_dst_lock;
-- cgit v1.2.3
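A build-time assertion in the spirit of the new comment could look like the following (hypothetical; the patch itself only documents the invariant, it does not add such a check):

/* Hypothetical check: on 64-bit with IPv6 enabled, the lookup keys fill
 * the first two 64-byte cache lines and the frequently dirtied sk_refcnt
 * starts the third, so hot-path readers of the keys avoid false sharing.
 */
BUILD_BUG_ON(offsetof(struct sock, sk_refcnt) != 128);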
From ed53d0ab761f5c71d77c8dc05fd19c0a851200db Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 8 Oct 2015 19:33:23 -0700
Subject: net: shrink struct sock and request_sock by 8 bytes

One 32-bit hole follows skc_refcnt; use it.

skc_incoming_cpu can also be a union with the request_sock rcv_wnd.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/request_sock.h |  5 ++---
 include/net/sock.h         | 14 +++++++++-----
 net/ipv4/syncookies.c      |  4 ++--
 net/ipv4/tcp_input.c       |  2 +-
 net/ipv4/tcp_ipv4.c        |  2 +-
 net/ipv4/tcp_minisocks.c   | 18 +++++++++---------
 net/ipv4/tcp_output.c      |  2 +-
 net/ipv6/syncookies.c      |  4 ++--
 net/ipv6/tcp_ipv6.c        |  2 +-
 9 files changed, 28 insertions(+), 25 deletions(-)

(limited to 'include/net/request_sock.h')

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 6b818b77d5e5..2e73748956d5 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -51,15 +51,14 @@ struct request_sock {
 #define rsk_refcnt			__req_common.skc_refcnt
 #define rsk_hash			__req_common.skc_hash
 #define rsk_listener			__req_common.skc_listener
+#define rsk_window_clamp		__req_common.skc_window_clamp
+#define rsk_rcv_wnd			__req_common.skc_rcv_wnd

 	struct request_sock		*dl_next;
 	u16				mss;
 	u8				num_retrans; /* number of retransmits */
 	u8				cookie_ts:1; /* syncookie: encode tcpopts in timestamp */
 	u8				num_timeout:7; /* number of timeouts */
-	/* The following two fields can be easily recomputed I think -AK */
-	u32				window_clamp; /* window clamp at creation time */
-	u32				rcv_wnd;	  /* rcv_wnd offered first time */
 	u32				ts_recent;
 	struct timer_list		rsk_timer;
 	const struct request_sock_ops	*rsk_ops;
diff --git a/include/net/sock.h b/include/net/sock.h
index 65712409464b..19cfe1fc911c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -226,11 +226,18 @@ struct sock_common {
 		struct hlist_nulls_node skc_nulls_node;
 	};
 	int			skc_tx_queue_mapping;
-	int			skc_incoming_cpu;
+	union {
+		int		skc_incoming_cpu;
+		u32		skc_rcv_wnd;
+	};

 	atomic_t		skc_refcnt;
 	/* private: */
 	int                     skc_dontcopy_end[0];
+	union {
+		u32		skc_rxhash;
+		u32		skc_window_clamp;
+	};
 	/* public: */
 };

@@ -287,7 +294,6 @@ struct cg_proto;
  *	@sk_rcvlowat: %SO_RCVLOWAT setting
  *	@sk_rcvtimeo: %SO_RCVTIMEO setting
  *	@sk_sndtimeo: %SO_SNDTIMEO setting
- *	@sk_rxhash: flow hash received from netif layer
  *	@sk_txhash: computed flow hash for use on transmit
  *	@sk_filter: socket filtering instructions
  *	@sk_timer: sock cleanup timer
@@ -346,6 +352,7 @@ struct sock {
 #define sk_cookie		__sk_common.skc_cookie
 #define sk_incoming_cpu		__sk_common.skc_incoming_cpu
 #define sk_flags		__sk_common.skc_flags
+#define sk_rxhash		__sk_common.skc_rxhash

 	socket_lock_t		sk_lock;
 	struct sk_buff_head	sk_receive_queue;
@@ -365,9 +372,6 @@ struct sock {
 	} sk_backlog;
 #define sk_rmem_alloc sk_backlog.rmem_alloc
 	int			sk_forward_alloc;
-#ifdef CONFIG_RPS
-	__u32			sk_rxhash;
-#endif
 	__u32			sk_txhash;

 #ifdef CONFIG_NET_RX_BUSY_POLL
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 2dbb11331f6c..4c0892badb8b 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -382,10 +382,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	}

 	/* Try to redo what tcp_v4_send_synack did. */
-	req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
+	req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);

 	tcp_select_initial_window(tcp_full_space(sk), req->mss,
-				  &req->rcv_wnd, &req->window_clamp,
+				  &req->rsk_rcv_wnd, &req->rsk_window_clamp,
 				  ireq->wscale_ok, &rcv_wscale,
 				  dst_metric(&rt->dst, RTAX_INITRWND));
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ddadb318e850..3b35c3f4d268 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6022,7 +6022,7 @@ static void tcp_openreq_init(struct request_sock *req,
 {
 	struct inet_request_sock *ireq = inet_rsk(req);

-	req->rcv_wnd = 0;		/* So that tcp_send_synack() knows! */
+	req->rsk_rcv_wnd = 0;		/* So that tcp_send_synack() knows! */
 	req->cookie_ts = 0;
 	tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
 	tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 34310748a365..ddb198392c7f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -803,7 +803,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 	 */
 	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
 			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
-			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
+			tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
 			tcp_time_stamp,
 			req->ts_recent,
 			0,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 1079e6ad77fe..41828bdc5d32 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -381,18 +381,18 @@ void tcp_openreq_init_rwin(struct request_sock *req,

 	window_clamp = READ_ONCE(tp->window_clamp);
 	/* Set this up on the first call only */
-	req->window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW);
+	req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW);

 	/* limit the window selection if the user enforce a smaller rx buffer */
 	if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK &&
-	    (req->window_clamp > full_space || req->window_clamp == 0))
-		req->window_clamp = full_space;
+	    (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
+		req->rsk_window_clamp = full_space;

 	/* tcp_full_space because it is guaranteed to be the first packet */
 	tcp_select_initial_window(full_space,
 		mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
-		&req->rcv_wnd,
-		&req->window_clamp,
+		&req->rsk_rcv_wnd,
+		&req->rsk_window_clamp,
 		ireq->wscale_ok,
 		&rcv_wscale,
 		dst_metric(dst, RTAX_INITRWND));
@@ -512,9 +512,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 			if (sysctl_tcp_fack)
 				tcp_enable_fack(newtp);
 		}
-		newtp->window_clamp = req->window_clamp;
-		newtp->rcv_ssthresh = req->rcv_wnd;
-		newtp->rcv_wnd = req->rcv_wnd;
+		newtp->window_clamp = req->rsk_window_clamp;
+		newtp->rcv_ssthresh = req->rsk_rcv_wnd;
+		newtp->rcv_wnd = req->rsk_rcv_wnd;
 		newtp->rx_opt.wscale_ok = ireq->wscale_ok;
 		if (newtp->rx_opt.wscale_ok) {
 			newtp->rx_opt.snd_wscale = ireq->snd_wscale;
@@ -707,7 +707,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 	/* RFC793: "first check sequence number". */

 	if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
-					  tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {
+					  tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) {
 		/* Out of window: send ACK and drop. */
 		if (!(flg & TCP_FLAG_RST))
 			req->rsk_ops->send_ack(sk, skb, req);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 55ed3266b05f..6e79fcb0addb 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3023,7 +3023,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);

 	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
-	th->window = htons(min(req->rcv_wnd, 65535U));
+	th->window = htons(min(req->rsk_rcv_wnd, 65535U));
 	tcp_options_write((__be32 *)(th + 1), NULL, &opts);
 	th->doff = (tcp_header_size >> 2);
 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index f610b5310b17..bb8f2fa1c7fb 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -235,9 +235,9 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 			goto out_free;
 	}

-	req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW);
+	req->rsk_window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW);
 	tcp_select_initial_window(tcp_full_space(sk), req->mss,
-				  &req->rcv_wnd, &req->window_clamp,
+				  &req->rsk_rcv_wnd, &req->rsk_window_clamp,
 				  ireq->wscale_ok, &rcv_wscale,
 				  dst_metric(dst, RTAX_INITRWND));
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 33334f0c217d..2887c8474b65 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -931,7 +931,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 	 */
 	tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ?
 			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
-			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
+			tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
 			tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if,
 			tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr),
 			0, 0);
-- cgit v1.2.3
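The enabling observation is that the overlaid fields belong to socket flavours that never coexist in one object, so the unions cost nothing. A reduced model of the trick (illustrative names, not the kernel layout):

#include <stdio.h>

/* Reduced model of the sock_common overlay: a full socket uses
 * incoming_cpu/rxhash, a request socket uses rcv_wnd/window_clamp,
 * and no object ever needs both sets at once.
 */
struct common {
	union {
		int		incoming_cpu;	/* full sockets */
		unsigned int	rcv_wnd;	/* request sockets */
	};
	union {
		unsigned int	rxhash;		/* full sockets */
		unsigned int	window_clamp;	/* request sockets */
	};
};

int main(void)
{
	printf("sizeof(struct common) = %zu\n", sizeof(struct common));
	return 0;
}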
From ebb516af60e18258aac8e80bbe068740ef1579ed Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 14 Oct 2015 11:16:28 -0700
Subject: tcp/dccp: fix race at listener dismantle phase

Under stress, a close() on a listener can trigger the
WARN_ON(sk->sk_ack_backlog) in inet_csk_listen_stop().

We need to test whether the listener is still active before queueing a
child in inet_csk_reqsk_queue_add().

Create a common inet_child_forget() helper, and use it from both
inet_csk_reqsk_queue_add() and inet_csk_listen_stop().

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/inet_connection_sock.h |  9 ++---
 include/net/request_sock.h         | 19 ----------
 net/ipv4/inet_connection_sock.c    | 71 ++++++++++++++++++++++++++------------
 3 files changed, 51 insertions(+), 48 deletions(-)

(limited to 'include/net/request_sock.h')

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index e84ea9f2498f..63615709839d 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -268,13 +268,8 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
 					    struct sock *newsk,
 					    const struct request_sock *req);

-static inline void inet_csk_reqsk_queue_add(struct sock *sk,
-					    struct request_sock *req,
-					    struct sock *child)
-{
-	reqsk_queue_add(&inet_csk(sk)->icsk_accept_queue, req, sk, child);
-}
-
+void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req,
+			      struct sock *child);
 void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
 				   unsigned long timeout);

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 2e73748956d5..a0dde04eb178 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -186,25 +186,6 @@ static inline bool reqsk_queue_empty(const struct request_sock_queue *queue)
 	return queue->rskq_accept_head == NULL;
 }

-static inline void reqsk_queue_add(struct request_sock_queue *queue,
-				   struct request_sock *req,
-				   struct sock *parent,
-				   struct sock *child)
-{
-	spin_lock(&queue->rskq_lock);
-	req->sk = child;
-	sk_acceptq_added(parent);
-
-	if (queue->rskq_accept_head == NULL)
-		queue->rskq_accept_head = req;
-	else
-		queue->rskq_accept_tail->dl_next = req;
-
-	queue->rskq_accept_tail = req;
-	req->dl_next = NULL;
-	spin_unlock(&queue->rskq_lock);
-}
-
 static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue *queue,
 						      struct sock *parent)
 {
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index b85c720956a9..8430bc8ccd58 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -764,6 +764,53 @@ int inet_csk_listen_start(struct sock *sk, int backlog)
 }
 EXPORT_SYMBOL_GPL(inet_csk_listen_start);

+static void inet_child_forget(struct sock *sk, struct request_sock *req,
+			      struct sock *child)
+{
+	sk->sk_prot->disconnect(child, O_NONBLOCK);
+
+	sock_orphan(child);
+
+	percpu_counter_inc(sk->sk_prot->orphan_count);
+
+	if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
+		BUG_ON(tcp_sk(child)->fastopen_rsk != req);
+		BUG_ON(sk != req->rsk_listener);
+
+		/* Paranoid, to prevent race condition if
+		 * an inbound pkt destined for child is
+		 * blocked by sock lock in tcp_v4_rcv().
+		 * Also to satisfy an assertion in
+		 * tcp_v4_destroy_sock().
+		 */
+		tcp_sk(child)->fastopen_rsk = NULL;
+	}
+	inet_csk_destroy_sock(child);
+	reqsk_put(req);
+}
+
+void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req,
+			      struct sock *child)
+{
+	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
+
+	spin_lock(&queue->rskq_lock);
+	if (unlikely(sk->sk_state != TCP_LISTEN)) {
+		inet_child_forget(sk, req, child);
+	} else {
+		req->sk = child;
+		req->dl_next = NULL;
+		if (queue->rskq_accept_head == NULL)
+			queue->rskq_accept_head = req;
+		else
+			queue->rskq_accept_tail->dl_next = req;
+		queue->rskq_accept_tail = req;
+		sk_acceptq_added(sk);
+	}
+	spin_unlock(&queue->rskq_lock);
+}
+EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
+
 /*
  *	This routine closes sockets which have been at least partially
  *	opened, but not yet accepted.
@@ -790,31 +837,11 @@ void inet_csk_listen_stop(struct sock *sk)
 		WARN_ON(sock_owned_by_user(child));
 		sock_hold(child);

-		sk->sk_prot->disconnect(child, O_NONBLOCK);
-
-		sock_orphan(child);
-
-		percpu_counter_inc(sk->sk_prot->orphan_count);
-
-		if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
-			BUG_ON(tcp_sk(child)->fastopen_rsk != req);
-			BUG_ON(sk != req->rsk_listener);
-
-			/* Paranoid, to prevent race condition if
-			 * an inbound pkt destined for child is
-			 * blocked by sock lock in tcp_v4_rcv().
-			 * Also to satisfy an assertion in
-			 * tcp_v4_destroy_sock().
-			 */
-			tcp_sk(child)->fastopen_rsk = NULL;
-		}
-		inet_csk_destroy_sock(child);
-
+		inet_child_forget(sk, req, child);
 		bh_unlock_sock(child);
 		local_bh_enable();
 		sock_put(child);

-		reqsk_put(req);
 		cond_resched();
 	}
 	if (queue->fastopenq.rskq_rst_head) {
@@ -829,7 +856,7 @@ void inet_csk_listen_stop(struct sock *sk)
 			req = next;
 		}
 	}
-	WARN_ON(sk->sk_ack_backlog);
+	WARN_ON_ONCE(sk->sk_ack_backlog);
 }
 EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
-- cgit v1.2.3
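The shape of this fix generalises beyond the code above: the publish step must re-check the listener's liveness under the same lock the dismantle path takes, or a child can be queued onto a dying listener. A userspace-flavoured reduction of the pattern (stub types; the boolean stands in for the TCP_LISTEN state check, and rskq_lock is assumed held by the caller):

#include <stdbool.h>
#include <stdio.h>

struct listener { bool listening; int ack_backlog; };

/* Model of inet_csk_reqsk_queue_add() after the patch: with rskq_lock
 * held, queue the child only while the listener is still in TCP_LISTEN;
 * otherwise tear it down immediately (inet_child_forget()).
 */
static bool queue_add(struct listener *l)
{
	if (!l->listening)
		return false;	/* child is forgotten, not queued */
	l->ack_backlog++;	/* publish on rskq_accept_head */
	return true;
}

int main(void)
{
	struct listener l = { .listening = false, .ack_backlog = 0 };

	printf("queued=%d backlog=%d\n", queue_add(&l), l.ack_backlog);
	return 0;
}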