Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next

Pull networking changes from David Miller: "Noteworthy changes this time around: 1) Multicast rejoin support for team driver, from Jiri Pirko. 2) Centralize and simplify TCP RTT measurement handling in order to reduce the impact of bad RTO seeding from SYN/ACKs. Also, when both timestamps and local RTT measurements are available prefer the later because there are broken middleware devices which scramble the timestamp. From Yuchung Cheng. 3) Add TCP_NOTSENT_LOWAT socket option to limit the amount of kernel memory consumed to queue up unsend user data. From Eric Dumazet. 4) Add a "physical port ID" abstraction for network devices, from Jiri Pirko. 5) Add a "suppress" operation to influence fib_rules lookups, from Stefan Tomanek. 6) Add a networking development FAQ, from Paul Gortmaker. 7) Extend the information provided by tcp_probe and add ipv6 support, from Daniel Borkmann. 8) Use RCU locking more extensively in openvswitch data paths, from Pravin B Shelar. 9) Add SCTP support to openvswitch, from Joe Stringer. 10) Add EF10 chip support to SFC driver, from Ben Hutchings. 11) Add new SYNPROXY netfilter target, from Patrick McHardy. 12) Compute a rate approximation for sending in TCP sockets, and use this to more intelligently coalesce TSO frames. Furthermore, add a new packet scheduler which takes advantage of this estimate when available. From Eric Dumazet. 13) Allow AF_PACKET fanouts with random selection, from Daniel Borkmann. 14) Add ipv6 support to vxlan driver, from Cong Wang" Resolved conflicts as per discussion. * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1218 commits) openvswitch: Fix alignment of struct sw_flow_key. netfilter: Fix build errors with xt_socket.c tcp: Add missing braces to do_tcp_setsockopt caif: Add missing braces to multiline if in cfctrl_linkup_request bnx2x: Add missing braces in bnx2x:bnx2x_link_initialize vxlan: Fix kernel panic on device delete. net: mvneta: implement ->ndo_do_ioctl() to support PHY ioctls net: mvneta: properly disable HW PHY polling and ensure adjust_link() works icplus: Use netif_running to determine device state ethernet/arc/arc_emac: Fix huge delays in large file copies tuntap: orphan frags before trying to set tx timestamp tuntap: purge socket error queue on detach qlcnic: use standard NAPI weights ipv6:introduce function to find route for redirect bnx2x: VF RSS support - VF side bnx2x: VF RSS support - PF side vxlan: Notify drivers for listening UDP port changes net: usbnet: update addr_assign_type if appropriate driver/net: enic: update enic maintainers and driver driver/net: enic: Exposing symbols for Cisco's low latency driver ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2013-09-05 14:54:29 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2013-09-05 14:54:29 -0700
commit: cc998ff8811530be521f6b316f37ab7676a07938 (patch)
tree: a054b3bf4b2ef406bf756a6cfc9be2f9115f17ae /net/ipv4/tcp.c
parent: 57d730924d5cc2c3e280af16a9306587c3a511db (diff)
parent: 0d40f75bdab241868c0eb6f97aef9f8b3a66f7b3 (diff)
download: linux-cc998ff8811530be521f6b316f37ab7676a07938.tar.bz2
1 files changed, 34 insertions, 12 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b2f6c74861af..6e5617b9f9db 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -283,6 +283,8 @@
 
 int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
 
+int sysctl_tcp_min_tso_segs __read_mostly = 2;
+
 struct percpu_counter tcp_orphan_count;
 EXPORT_SYMBOL_GPL(tcp_orphan_count);
 
@@ -410,10 +412,6 @@ void tcp_init_sock(struct sock *sk)
 
 	icsk->icsk_sync_mss = tcp_sync_mss;
 
-	/* Presumed zeroed, in order of appearance:
-	 *	cookie_in_always, cookie_out_never,
-	 *	s_data_constant, s_data_in, s_data_out
-	 */
 	sk->sk_sndbuf = sysctl_tcp_wmem[1];
 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
 
@@ -499,7 +497,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 			mask |= POLLIN | POLLRDNORM;
 
 		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
-			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
+			if (sk_stream_is_writeable(sk)) {
 				mask |= POLLOUT | POLLWRNORM;
 			} else {  /* send SIGIO later */
 				set_bit(SOCK_ASYNC_NOSPACE,
@@ -510,7 +508,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 				 * wspace test but before the flags are set,
 				 * IO signal will be lost.
 				 */
-				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
+				if (sk_stream_is_writeable(sk))
 					mask |= POLLOUT | POLLWRNORM;
 			}
 		} else
@@ -789,12 +787,28 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
 	xmit_size_goal = mss_now;
 
 	if (large_allowed && sk_can_gso(sk)) {
-		xmit_size_goal = ((sk->sk_gso_max_size - 1) -
-				  inet_csk(sk)->icsk_af_ops->net_header_len -
-				  inet_csk(sk)->icsk_ext_hdr_len -
-				  tp->tcp_header_len);
+		u32 gso_size, hlen;
+
+		/* Maybe we should/could use sk->sk_prot->max_header here ? */
+		hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
+		       inet_csk(sk)->icsk_ext_hdr_len +
+		       tp->tcp_header_len;
+
+		/* Goal is to send at least one packet per ms,
+		 * not one big TSO packet every 100 ms.
+		 * This preserves ACK clocking and is consistent
+		 * with tcp_tso_should_defer() heuristic.
+		 */
+		gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC);
+		gso_size = max_t(u32, gso_size,
+				 sysctl_tcp_min_tso_segs * mss_now);
+
+		xmit_size_goal = min_t(u32, gso_size,
+				       sk->sk_gso_max_size - 1 - hlen);
 
-		/* TSQ : try to have two TSO segments in flight */
+		/* TSQ : try to have at least two segments in flight
+		 * (one in NIC TX ring, another in Qdisc)
+		 */
 		xmit_size_goal = min_t(u32, xmit_size_goal,
 				       sysctl_tcp_limit_output_bytes >> 1);
 
@@ -2454,10 +2468,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 	case TCP_THIN_DUPACK:
 		if (val < 0 || val > 1)
 			err = -EINVAL;
-		else
+		else {
 			tp->thin_dupack = val;
 			if (tp->thin_dupack)
 				tcp_disable_early_retrans(tp);
+		}
 		break;
 
 	case TCP_REPAIR:
@@ -2638,6 +2653,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		else
 			tp->tsoffset = val - tcp_time_stamp;
 		break;
+	case TCP_NOTSENT_LOWAT:
+		tp->notsent_lowat = val;
+		sk->sk_write_space(sk);
+		break;
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -2854,6 +2873,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 	case TCP_TIMESTAMP:
 		val = tcp_time_stamp + tp->tsoffset;
 		break;
+	case TCP_NOTSENT_LOWAT:
+		val = tp->notsent_lowat;
+		break;
 	default:
 		return -ENOPROTOOPT;
 	}
author	Linus Torvalds <torvalds@linux-foundation.org>	2013-09-05 14:54:29 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-09-05 14:54:29 -0700
commit	cc998ff8811530be521f6b316f37ab7676a07938 (patch)
tree	a054b3bf4b2ef406bf756a6cfc9be2f9115f17ae /net/ipv4/tcp.c
parent	57d730924d5cc2c3e280af16a9306587c3a511db (diff)
parent	0d40f75bdab241868c0eb6f97aef9f8b3a66f7b3 (diff)
download	linux-cc998ff8811530be521f6b316f37ab7676a07938.tar.bz2