summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2017-01-25 14:04:39 -0500
committerDavid S. Miller <davem@davemloft.net>2017-01-25 14:04:39 -0500
commiteb92f76ef54f988af529c89e843958f8673de9e8 (patch)
tree87a4399e42ae6b0ae622718952d66e666e64d3ef /net
parenta9c54ad2c737853b1f404b10b09da4ca7fa18597 (diff)
parent19f6d3f3c8422d65b5e3d2162e30ef07c6e21ea2 (diff)
downloadlinux-eb92f76ef54f988af529c89e843958f8673de9e8.tar.bz2
Merge branch 'tcp-fastopen-new-API'
Wei Wang says: ==================== net/tcp-fastopen: Add new userspace API support The patch series is to add support for new userspace API for TCP fastopen sockets. In the current code, user has to call sendto()/sendmsg() with special flag MSG_FASTOPEN for TCP fastopen sockets. This API is quite different from the normal TCP socket API and can be cumbersome for applications to make use fastopen sockets. So this new patch introduces a new way of using TCP fastopen sockets which is similar to normal TCP sockets with a new sockopt TCP_FASTOPEN_CONNECT. More details about it is described in the third patch. (First 2 patches are preparations for the third patch.) ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r--net/ipv4/af_inet.c31
-rw-r--r--net/ipv4/tcp.c35
-rw-r--r--net/ipv4/tcp_fastopen.c54
-rw-r--r--net/ipv4/tcp_ipv4.c7
-rw-r--r--net/ipv4/tcp_output.c16
-rw-r--r--net/ipv6/tcp_ipv6.c6
6 files changed, 125 insertions, 24 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 28fe8da4e1ac..92e7f3e957fa 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -576,13 +576,24 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int err;
long timeo;
- if (addr_len < sizeof(uaddr->sa_family))
- return -EINVAL;
+ /*
+ * uaddr can be NULL and addr_len can be 0 if:
+ * sk is a TCP fastopen active socket and
+ * TCP_FASTOPEN_CONNECT sockopt is set and
+ * we already have a valid cookie for this socket.
+ * In this case, user can call write() after connect().
+ * write() will invoke tcp_sendmsg_fastopen() which calls
+ * __inet_stream_connect().
+ */
+ if (uaddr) {
+ if (addr_len < sizeof(uaddr->sa_family))
+ return -EINVAL;
- if (uaddr->sa_family == AF_UNSPEC) {
- err = sk->sk_prot->disconnect(sk, flags);
- sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
- goto out;
+ if (uaddr->sa_family == AF_UNSPEC) {
+ err = sk->sk_prot->disconnect(sk, flags);
+ sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+ goto out;
+ }
}
switch (sock->state) {
@@ -593,7 +604,10 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
err = -EISCONN;
goto out;
case SS_CONNECTING:
- err = -EALREADY;
+ if (inet_sk(sk)->defer_connect)
+ err = -EINPROGRESS;
+ else
+ err = -EALREADY;
/* Fall out of switch with err, set for this state */
break;
case SS_UNCONNECTED:
@@ -607,6 +621,9 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
sock->state = SS_CONNECTING;
+ if (!err && inet_sk(sk)->defer_connect)
+ goto out;
+
/* Just entered SS_CONNECTING state; the only
* difference is that return value in non-blocking
* case is EINPROGRESS, rather than EALREADY.
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c43eb1a831d7..d9735b76d073 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -533,6 +533,12 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (tp->urg_data & TCP_URG_VALID)
mask |= POLLPRI;
+ } else if (sk->sk_state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
+ /* Active TCP fastopen socket with defer_connect
+ * Return POLLOUT so application can call write()
+ * in order for kernel to generate SYN+data
+ */
+ mask |= POLLOUT | POLLWRNORM;
}
/* This barrier is coupled with smp_wmb() in tcp_reset() */
smp_rmb();
@@ -1071,6 +1077,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
int *copied, size_t size)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
int err, flags;
if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
@@ -1085,9 +1092,19 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
tp->fastopen_req->data = msg;
tp->fastopen_req->size = size;
+ if (inet->defer_connect) {
+ err = tcp_connect(sk);
+ /* Same failure procedure as in tcp_v4/6_connect */
+ if (err) {
+ tcp_set_state(sk, TCP_CLOSE);
+ inet->inet_dport = 0;
+ sk->sk_route_caps = 0;
+ }
+ }
flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
msg->msg_namelen, flags);
+ inet->defer_connect = 0;
*copied = tp->fastopen_req->copied;
tcp_free_fastopen_req(tp);
return err;
@@ -1107,7 +1124,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
lock_sock(sk);
flags = msg->msg_flags;
- if (flags & MSG_FASTOPEN) {
+ if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) {
err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
if (err == -EINPROGRESS && copied_syn > 0)
goto out;
@@ -2656,6 +2673,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
err = -EINVAL;
}
break;
+ case TCP_FASTOPEN_CONNECT:
+ if (val > 1 || val < 0) {
+ err = -EINVAL;
+ } else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
+ if (sk->sk_state == TCP_CLOSE)
+ tp->fastopen_connect = val;
+ else
+ err = -EINVAL;
+ } else {
+ err = -EOPNOTSUPP;
+ }
+ break;
case TCP_TIMESTAMP:
if (!tp->repair)
err = -EPERM;
@@ -3016,6 +3045,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = icsk->icsk_accept_queue.fastopenq.max_qlen;
break;
+ case TCP_FASTOPEN_CONNECT:
+ val = tp->fastopen_connect;
+ break;
+
case TCP_TIMESTAMP:
val = tcp_time_stamp + tp->tsoffset;
break;
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index f51919535ca7..9674bec4a0f8 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -325,3 +325,57 @@ fastopen:
*foc = valid_foc;
return NULL;
}
+
+bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
+ struct tcp_fastopen_cookie *cookie)
+{
+ unsigned long last_syn_loss = 0;
+ int syn_loss = 0;
+
+ tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss);
+
+ /* Recurring FO SYN losses: no cookie or data in SYN */
+ if (syn_loss > 1 &&
+ time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
+ cookie->len = -1;
+ return false;
+ }
+ if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) {
+ cookie->len = -1;
+ return true;
+ }
+ return cookie->len > 0;
+}
+
+/* This function checks if we want to defer sending SYN until the first
+ * write(). We defer under the following conditions:
+ * 1. fastopen_connect sockopt is set
+ * 2. we have a valid cookie
+ * Return value: return true if we want to defer until application writes data
+ * return false if we want to send out SYN immediately
+ */
+bool tcp_fastopen_defer_connect(struct sock *sk, int *err)
+{
+ struct tcp_fastopen_cookie cookie = { .len = 0 };
+ struct tcp_sock *tp = tcp_sk(sk);
+ u16 mss;
+
+ if (tp->fastopen_connect && !tp->fastopen_req) {
+ if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) {
+ inet_sk(sk)->defer_connect = 1;
+ return true;
+ }
+
+ /* Alloc fastopen_req in order for FO option to be included
+ * in SYN
+ */
+ tp->fastopen_req = kzalloc(sizeof(*tp->fastopen_req),
+ sk->sk_allocation);
+ if (tp->fastopen_req)
+ tp->fastopen_req->cookie = cookie;
+ else
+ *err = -ENOBUFS;
+ }
+ return false;
+}
+EXPORT_SYMBOL(tcp_fastopen_defer_connect);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a90b4540c11e..8c9e9aa17d66 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -232,6 +232,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
/* OK, now commit destination to socket. */
sk->sk_gso_type = SKB_GSO_TCPV4;
sk_setup_caps(sk, &rt->dst);
+ rt = NULL;
if (!tp->write_seq && likely(!tp->repair))
tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
@@ -242,9 +243,13 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_id = tp->write_seq ^ jiffies;
+ if (tcp_fastopen_defer_connect(sk, &err))
+ return err;
+ if (err)
+ goto failure;
+
err = tcp_connect(sk);
- rt = NULL;
if (err)
goto failure;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9a1a1494b9dd..671c69535671 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3267,23 +3267,11 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_fastopen_request *fo = tp->fastopen_req;
- int syn_loss = 0, space, err = 0;
- unsigned long last_syn_loss = 0;
+ int space, err = 0;
struct sk_buff *syn_data;
tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
- tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
- &syn_loss, &last_syn_loss);
- /* Recurring FO SYN losses: revert to regular handshake temporarily */
- if (syn_loss > 1 &&
- time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
- fo->cookie.len = -1;
- goto fallback;
- }
-
- if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
- fo->cookie.len = -1;
- else if (fo->cookie.len <= 0)
+ if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
goto fallback;
/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index f72100eedd5d..95c05e5293b1 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -287,6 +287,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
inet->inet_dport,
&tp->tsoffset);
+ if (tcp_fastopen_defer_connect(sk, &err))
+ return err;
+ if (err)
+ goto late_failure;
+
err = tcp_connect(sk);
if (err)
goto late_failure;
@@ -295,7 +300,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
late_failure:
tcp_set_state(sk, TCP_CLOSE);
- __sk_dst_reset(sk);
failure:
inet->inet_dport = 0;
sk->sk_route_caps = 0;