diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-05 14:54:29 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-05 14:54:29 -0700 |
commit | cc998ff8811530be521f6b316f37ab7676a07938 (patch) | |
tree | a054b3bf4b2ef406bf756a6cfc9be2f9115f17ae /net/ipv4/tcp.c | |
parent | 57d730924d5cc2c3e280af16a9306587c3a511db (diff) | |
parent | 0d40f75bdab241868c0eb6f97aef9f8b3a66f7b3 (diff) | |
download | linux-cc998ff8811530be521f6b316f37ab7676a07938.tar.bz2 |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking changes from David Miller:
"Noteworthy changes this time around:
1) Multicast rejoin support for team driver, from Jiri Pirko.
2) Centralize and simplify TCP RTT measurement handling in order to
reduce the impact of bad RTO seeding from SYN/ACKs. Also, when
both timestamps and local RTT measurements are available prefer
the later because there are broken middleware devices which
scramble the timestamp.
From Yuchung Cheng.
3) Add TCP_NOTSENT_LOWAT socket option to limit the amount of kernel
memory consumed to queue up unsend user data. From Eric Dumazet.
4) Add a "physical port ID" abstraction for network devices, from
Jiri Pirko.
5) Add a "suppress" operation to influence fib_rules lookups, from
Stefan Tomanek.
6) Add a networking development FAQ, from Paul Gortmaker.
7) Extend the information provided by tcp_probe and add ipv6 support,
from Daniel Borkmann.
8) Use RCU locking more extensively in openvswitch data paths, from
Pravin B Shelar.
9) Add SCTP support to openvswitch, from Joe Stringer.
10) Add EF10 chip support to SFC driver, from Ben Hutchings.
11) Add new SYNPROXY netfilter target, from Patrick McHardy.
12) Compute a rate approximation for sending in TCP sockets, and use
this to more intelligently coalesce TSO frames. Furthermore, add
a new packet scheduler which takes advantage of this estimate when
available. From Eric Dumazet.
13) Allow AF_PACKET fanouts with random selection, from Daniel
Borkmann.
14) Add ipv6 support to vxlan driver, from Cong Wang"
Resolved conflicts as per discussion.
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1218 commits)
openvswitch: Fix alignment of struct sw_flow_key.
netfilter: Fix build errors with xt_socket.c
tcp: Add missing braces to do_tcp_setsockopt
caif: Add missing braces to multiline if in cfctrl_linkup_request
bnx2x: Add missing braces in bnx2x:bnx2x_link_initialize
vxlan: Fix kernel panic on device delete.
net: mvneta: implement ->ndo_do_ioctl() to support PHY ioctls
net: mvneta: properly disable HW PHY polling and ensure adjust_link() works
icplus: Use netif_running to determine device state
ethernet/arc/arc_emac: Fix huge delays in large file copies
tuntap: orphan frags before trying to set tx timestamp
tuntap: purge socket error queue on detach
qlcnic: use standard NAPI weights
ipv6:introduce function to find route for redirect
bnx2x: VF RSS support - VF side
bnx2x: VF RSS support - PF side
vxlan: Notify drivers for listening UDP port changes
net: usbnet: update addr_assign_type if appropriate
driver/net: enic: update enic maintainers and driver
driver/net: enic: Exposing symbols for Cisco's low latency driver
...
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r-- | net/ipv4/tcp.c | 46 |
1 files changed, 34 insertions, 12 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b2f6c74861af..6e5617b9f9db 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -283,6 +283,8 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; +int sysctl_tcp_min_tso_segs __read_mostly = 2; + struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); @@ -410,10 +412,6 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_sync_mss = tcp_sync_mss; - /* Presumed zeroed, in order of appearance: - * cookie_in_always, cookie_out_never, - * s_data_constant, s_data_in, s_data_out - */ sk->sk_sndbuf = sysctl_tcp_wmem[1]; sk->sk_rcvbuf = sysctl_tcp_rmem[1]; @@ -499,7 +497,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) mask |= POLLIN | POLLRDNORM; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { - if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { + if (sk_stream_is_writeable(sk)) { mask |= POLLOUT | POLLWRNORM; } else { /* send SIGIO later */ set_bit(SOCK_ASYNC_NOSPACE, @@ -510,7 +508,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) * wspace test but before the flags are set, * IO signal will be lost. */ - if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) + if (sk_stream_is_writeable(sk)) mask |= POLLOUT | POLLWRNORM; } } else @@ -789,12 +787,28 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, xmit_size_goal = mss_now; if (large_allowed && sk_can_gso(sk)) { - xmit_size_goal = ((sk->sk_gso_max_size - 1) - - inet_csk(sk)->icsk_af_ops->net_header_len - - inet_csk(sk)->icsk_ext_hdr_len - - tp->tcp_header_len); + u32 gso_size, hlen; + + /* Maybe we should/could use sk->sk_prot->max_header here ? */ + hlen = inet_csk(sk)->icsk_af_ops->net_header_len + + inet_csk(sk)->icsk_ext_hdr_len + + tp->tcp_header_len; + + /* Goal is to send at least one packet per ms, + * not one big TSO packet every 100 ms. + * This preserves ACK clocking and is consistent + * with tcp_tso_should_defer() heuristic. + */ + gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC); + gso_size = max_t(u32, gso_size, + sysctl_tcp_min_tso_segs * mss_now); + + xmit_size_goal = min_t(u32, gso_size, + sk->sk_gso_max_size - 1 - hlen); - /* TSQ : try to have two TSO segments in flight */ + /* TSQ : try to have at least two segments in flight + * (one in NIC TX ring, another in Qdisc) + */ xmit_size_goal = min_t(u32, xmit_size_goal, sysctl_tcp_limit_output_bytes >> 1); @@ -2454,10 +2468,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_THIN_DUPACK: if (val < 0 || val > 1) err = -EINVAL; - else + else { tp->thin_dupack = val; if (tp->thin_dupack) tcp_disable_early_retrans(tp); + } break; case TCP_REPAIR: @@ -2638,6 +2653,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level, else tp->tsoffset = val - tcp_time_stamp; break; + case TCP_NOTSENT_LOWAT: + tp->notsent_lowat = val; + sk->sk_write_space(sk); + break; default: err = -ENOPROTOOPT; break; @@ -2854,6 +2873,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_TIMESTAMP: val = tcp_time_stamp + tp->tsoffset; break; + case TCP_NOTSENT_LOWAT: + val = tp->notsent_lowat; + break; default: return -ENOPROTOOPT; } |