diff options
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r-- | net/ipv4/tcp.c | 213 |
1 files changed, 160 insertions, 53 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 995a2259bcfc..45534a5ab430 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -252,6 +252,7 @@ #include <linux/types.h> #include <linux/fcntl.h> #include <linux/poll.h> +#include <linux/inet_diag.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/skbuff.h> @@ -401,6 +402,7 @@ void tcp_init_sock(struct sock *sk) tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_clamp = ~0; tp->mss_cache = TCP_MSS_DEFAULT; + u64_stats_init(&tp->syncp); tp->reordering = sysctl_tcp_reordering; tcp_enable_early_retrans(tp); @@ -496,7 +498,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) /* Connected or passive Fast Open socket? */ if (sk->sk_state != TCP_SYN_SENT && - (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) { + (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk)) { int target = sock_rcvlowat(sk, 0, INT_MAX); if (tp->urg_seq == tp->copied_seq && @@ -520,8 +522,10 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) /* Race breaker. If space is freed after * wspace test but before the flags are set, - * IO signal will be lost. + * IO signal will be lost. Memory barrier + * pairs with the input side. */ + smp_mb__after_atomic(); if (sk_stream_is_writeable(sk)) mask |= POLLOUT | POLLWRNORM; } @@ -691,8 +695,9 @@ static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, struct tcp_splice_state *tss = rd_desc->arg.data; int ret; - ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len), - tss->flags); + ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe, + min(rd_desc->count, len), tss->flags, + skb_socket_splice); if (ret > 0) rd_desc->count -= ret; return ret; @@ -775,7 +780,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, ret = -EAGAIN; break; } - sk_wait_data(sk, &timeo); + sk_wait_data(sk, &timeo, NULL); if (signal_pending(current)) { ret = sock_intr_errno(timeo); break; @@ -805,16 +810,28 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, } EXPORT_SYMBOL(tcp_splice_read); -struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) +struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, + bool force_schedule) { struct sk_buff *skb; /* The TCP header must be at least 32-bit aligned. */ size = ALIGN(size, 4); + if (unlikely(tcp_under_memory_pressure(sk))) + sk_mem_reclaim_partial(sk); + skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); - if (skb) { - if (sk_wmem_schedule(sk, skb->truesize)) { + if (likely(skb)) { + bool mem_scheduled; + + if (force_schedule) { + mem_scheduled = true; + sk_forced_mem_schedule(sk, skb->truesize); + } else { + mem_scheduled = sk_wmem_schedule(sk, skb->truesize); + } + if (likely(mem_scheduled)) { skb_reserve(skb, sk->sk_prot->max_header); /* * Make sure that we have exactly size bytes @@ -904,7 +921,8 @@ new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; - skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); + skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, + skb_queue_empty(&sk->sk_write_queue)); if (!skb) goto wait_for_memory; @@ -983,6 +1001,9 @@ do_error: if (copied) goto out; out_err: + /* make sure we wake any epoll edge trigger waiter */ + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + sk->sk_write_space(sk); return sk_stream_error(sk, flags, err); } @@ -1028,7 +1049,7 @@ static inline int select_size(const struct sock *sk, bool sg) void tcp_free_fastopen_req(struct tcp_sock *tp) { - if (tp->fastopen_req != NULL) { + if (tp->fastopen_req) { kfree(tp->fastopen_req); tp->fastopen_req = NULL; } @@ -1042,12 +1063,12 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE)) return -EOPNOTSUPP; - if (tp->fastopen_req != NULL) + if (tp->fastopen_req) return -EALREADY; /* Another Fast Open is in progress */ tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request), sk->sk_allocation); - if (unlikely(tp->fastopen_req == NULL)) + if (unlikely(!tp->fastopen_req)) return -ENOBUFS; tp->fastopen_req->data = msg; tp->fastopen_req->size = size; @@ -1060,8 +1081,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, return err; } -int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t size) +int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -1120,7 +1140,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, sg = !!(sk->sk_route_caps & NETIF_F_SG); - while (iov_iter_count(&msg->msg_iter)) { + while (msg_data_left(msg)) { int copy = 0; int max = size_goal; @@ -1141,7 +1161,8 @@ new_segment: skb = sk_stream_alloc_skb(sk, select_size(sk, sg), - sk->sk_allocation); + sk->sk_allocation, + skb_queue_empty(&sk->sk_write_queue)); if (!skb) goto wait_for_memory; @@ -1164,8 +1185,8 @@ new_segment: } /* Try to append data to the end of skb. */ - if (copy > iov_iter_count(&msg->msg_iter)) - copy = iov_iter_count(&msg->msg_iter); + if (copy > msg_data_left(msg)) + copy = msg_data_left(msg); /* Where to copy to? */ if (skb_availroom(skb) > 0) { @@ -1222,7 +1243,7 @@ new_segment: tcp_skb_pcount_set(skb, 0); copied += copy; - if (!iov_iter_count(&msg->msg_iter)) { + if (!msg_data_left(msg)) { tcp_tx_timestamp(sk, skb); goto out; } @@ -1272,6 +1293,9 @@ do_error: goto out; out_err: err = sk_stream_error(sk, flags, err); + /* make sure we wake any epoll edge trigger waiter */ + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + sk->sk_write_space(sk); release_sock(sk); return err; } @@ -1539,8 +1563,8 @@ EXPORT_SYMBOL(tcp_read_sock); * Probably, code can be easily improved even more. */ -int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len, int nonblock, int flags, int *addr_len) +int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, + int flags, int *addr_len) { struct tcp_sock *tp = tcp_sk(sk); int copied = 0; @@ -1551,7 +1575,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, int target; /* Read at least this many bytes */ long timeo; struct task_struct *user_recv = NULL; - struct sk_buff *skb; + struct sk_buff *skb, *last; u32 urg_hole = 0; if (unlikely(flags & MSG_ERRQUEUE)) @@ -1611,7 +1635,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* Next get a buffer. */ + last = skb_peek_tail(&sk->sk_receive_queue); skb_queue_walk(&sk->sk_receive_queue, skb) { + last = skb; /* Now that we have two receive queues this * shouldn't happen. */ @@ -1730,8 +1756,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* Do not sleep, just process backlog. */ release_sock(sk); lock_sock(sk); - } else - sk_wait_data(sk, &timeo); + } else { + sk_wait_data(sk, &timeo, last); + } if (user_recv) { int chunk; @@ -1914,18 +1941,19 @@ EXPORT_SYMBOL_GPL(tcp_set_state); static const unsigned char new_state[16] = { /* current state: new state: action: */ - /* (Invalid) */ TCP_CLOSE, - /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, - /* TCP_SYN_SENT */ TCP_CLOSE, - /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, - /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1, - /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2, - /* TCP_TIME_WAIT */ TCP_CLOSE, - /* TCP_CLOSE */ TCP_CLOSE, - /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN, - /* TCP_LAST_ACK */ TCP_LAST_ACK, - /* TCP_LISTEN */ TCP_CLOSE, - /* TCP_CLOSING */ TCP_CLOSING, + [0 /* (Invalid) */] = TCP_CLOSE, + [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, + [TCP_SYN_SENT] = TCP_CLOSE, + [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, + [TCP_FIN_WAIT1] = TCP_FIN_WAIT1, + [TCP_FIN_WAIT2] = TCP_FIN_WAIT2, + [TCP_TIME_WAIT] = TCP_CLOSE, + [TCP_CLOSE] = TCP_CLOSE, + [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN, + [TCP_LAST_ACK] = TCP_LAST_ACK, + [TCP_LISTEN] = TCP_CLOSE, + [TCP_CLOSING] = TCP_CLOSING, + [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ }; static int tcp_close_state(struct sock *sk) @@ -2138,7 +2166,7 @@ adjudge_to_death: * aborted (e.g., closed with unread data) before 3WHS * finishes. */ - if (req != NULL) + if (req) reqsk_fastopen_remove(sk, req, false); inet_csk_destroy_sock(sk); } @@ -2479,6 +2507,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level, icsk->icsk_syn_retries = val; break; + case TCP_SAVE_SYN: + if (val < 0 || val > 1) + err = -EINVAL; + else + tp->save_syn = val; + break; + case TCP_LINGER2: if (val < 0) tp->linger2 = -1; @@ -2541,10 +2576,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | - TCPF_LISTEN))) + TCPF_LISTEN))) { + tcp_fastopen_init_key_once(true); + err = fastopen_init_queue(sk, val); - else + } else { err = -EINVAL; + } break; case TCP_TIMESTAMP: if (!tp->repair) @@ -2590,13 +2628,17 @@ EXPORT_SYMBOL(compat_tcp_setsockopt); #endif /* Return information about state of tcp endpoint in API format. */ -void tcp_get_info(const struct sock *sk, struct tcp_info *info) +void tcp_get_info(struct sock *sk, struct tcp_info *info) { - const struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); u32 now = tcp_time_stamp; + unsigned int start; + u32 rate; memset(info, 0, sizeof(*info)); + if (sk->sk_type != SOCK_STREAM) + return; info->tcpi_state = sk->sk_state; info->tcpi_ca_state = icsk->icsk_ca_state; @@ -2655,10 +2697,19 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info) info->tcpi_total_retrans = tp->total_retrans; - info->tcpi_pacing_rate = sk->sk_pacing_rate != ~0U ? - sk->sk_pacing_rate : ~0ULL; - info->tcpi_max_pacing_rate = sk->sk_max_pacing_rate != ~0U ? - sk->sk_max_pacing_rate : ~0ULL; + rate = READ_ONCE(sk->sk_pacing_rate); + info->tcpi_pacing_rate = rate != ~0U ? rate : ~0ULL; + + rate = READ_ONCE(sk->sk_max_pacing_rate); + info->tcpi_max_pacing_rate = rate != ~0U ? rate : ~0ULL; + + do { + start = u64_stats_fetch_begin_irq(&tp->syncp); + info->tcpi_bytes_acked = tp->bytes_acked; + info->tcpi_bytes_received = tp->bytes_received; + } while (u64_stats_fetch_retry_irq(&tp->syncp, start)); + info->tcpi_segs_out = tp->segs_out; + info->tcpi_segs_in = tp->segs_in; } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -2730,6 +2781,26 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return -EFAULT; return 0; } + case TCP_CC_INFO: { + const struct tcp_congestion_ops *ca_ops; + union tcp_cc_info info; + size_t sz = 0; + int attr; + + if (get_user(len, optlen)) + return -EFAULT; + + ca_ops = icsk->icsk_ca_ops; + if (ca_ops && ca_ops->get_info) + sz = ca_ops->get_info(sk, ~0U, &attr, &info); + + len = min_t(unsigned int, len, sz); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &info, len)) + return -EFAULT; + return 0; + } case TCP_QUICKACK: val = !icsk->icsk_ack.pingpong; break; @@ -2776,7 +2847,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_FASTOPEN: - if (icsk->icsk_accept_queue.fastopenq != NULL) + if (icsk->icsk_accept_queue.fastopenq) val = icsk->icsk_accept_queue.fastopenq->max_qlen; else val = 0; @@ -2788,6 +2859,42 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_NOTSENT_LOWAT: val = tp->notsent_lowat; break; + case TCP_SAVE_SYN: + val = tp->save_syn; + break; + case TCP_SAVED_SYN: { + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + if (tp->saved_syn) { + if (len < tp->saved_syn[0]) { + if (put_user(tp->saved_syn[0], optlen)) { + release_sock(sk); + return -EFAULT; + } + release_sock(sk); + return -EINVAL; + } + len = tp->saved_syn[0]; + if (put_user(len, optlen)) { + release_sock(sk); + return -EFAULT; + } + if (copy_to_user(optval, tp->saved_syn + 1, len)) { + release_sock(sk); + return -EFAULT; + } + tcp_saved_syn_free(tp); + release_sock(sk); + } else { + release_sock(sk); + len = 0; + if (put_user(len, optlen)) + return -EFAULT; + } + return 0; + } default: return -ENOPROTOOPT; } @@ -2960,7 +3067,7 @@ void tcp_done(struct sock *sk) tcp_set_state(sk, TCP_CLOSE); tcp_clear_xmit_timers(sk); - if (req != NULL) + if (req) reqsk_fastopen_remove(sk, req, false); sk->sk_shutdown = SHUTDOWN_MASK; @@ -2992,21 +3099,21 @@ __setup("thash_entries=", set_thash_entries); static void __init tcp_init_mem(void) { - unsigned long limit = nr_free_buffer_pages() / 8; + unsigned long limit = nr_free_buffer_pages() / 16; + limit = max(limit, 128UL); - sysctl_tcp_mem[0] = limit / 4 * 3; - sysctl_tcp_mem[1] = limit; - sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; + sysctl_tcp_mem[0] = limit / 4 * 3; /* 4.68 % */ + sysctl_tcp_mem[1] = limit; /* 6.25 % */ + sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */ } void __init tcp_init(void) { - struct sk_buff *skb = NULL; unsigned long limit; int max_rshare, max_wshare, cnt; unsigned int i; - BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); + sock_skb_cb_check_size(sizeof(struct tcp_skb_cb)); percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); |