From acb32ba3dee66d58704caeeb8c6ff95f60efdc66 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Nov 2011 13:04:43 +0000 Subject: ipv4: reduce percpu needs for icmpmsg mibs Reading /proc/net/snmp on a machine with a lot of cpus is very expensive (can be ~88000 us). This is because ICMPMSG MIB uses 4096 bytes per cpu, and folding values for all possible cpus can read 16 Mbytes of memory. ICMP messages are not considered as fast path on a typical server, and eventually few cpus handle them anyway. We can afford an atomic operation instead of using percpu data. This saves 4096 bytes per cpu and per network namespace. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/proc.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net/ipv4/proc.c') diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 466ea8bb7a4d..961eed4f510a 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -288,7 +288,7 @@ static void icmpmsg_put(struct seq_file *seq) count = 0; for (i = 0; i < ICMPMSG_MIB_MAX; i++) { - val = snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, i); + val = atomic_long_read(&net->mib.icmpmsg_statistics->mibs[i]); if (val) { type[count] = i; vals[count++] = val; @@ -307,6 +307,7 @@ static void icmp_put(struct seq_file *seq) { int i; struct net *net = seq->private; + atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs; seq_puts(seq, "\nIcmp: InMsgs InErrors"); for (i=0; icmpmibmap[i].name != NULL; i++) @@ -319,15 +320,13 @@ static void icmp_put(struct seq_file *seq) snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS)); for (i=0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, - icmpmibmap[i].index)); + atomic_long_read(ptr + icmpmibmap[i].index)); seq_printf(seq, " %lu %lu", snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); for (i=0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, - icmpmibmap[i].index | 0x100)); + atomic_long_read(ptr + (icmpmibmap[i].index | 0x100))); } /* -- cgit v1.2.3 From 180d8cd942ce336b2c869d324855c40c5db478ad Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Sun, 11 Dec 2011 21:47:02 +0000 Subject: foundations of per-cgroup memory pressure controlling. This patch replaces all uses of struct sock fields' memory_pressure, memory_allocated, sockets_allocated, and sysctl_mem to acessor macros. Those macros can either receive a socket argument, or a mem_cgroup argument, depending on the context they live in. Since we're only doing a macro wrapping here, no performance impact at all is expected in the case where we don't have cgroups disabled. Signed-off-by: Glauber Costa Reviewed-by: Hiroyouki Kamezawa CC: David S. Miller CC: Eric W. Biederman CC: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++- include/net/tcp.h | 3 +- net/core/sock.c | 57 ++++++++++++++++++------------ net/ipv4/proc.c | 6 ++-- net/ipv4/tcp_input.c | 12 +++---- net/ipv4/tcp_ipv4.c | 4 +-- net/ipv4/tcp_output.c | 2 +- net/ipv4/tcp_timer.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 9 files changed, 145 insertions(+), 39 deletions(-) (limited to 'net/ipv4/proc.c') diff --git a/include/net/sock.h b/include/net/sock.h index 8ac338cb39ce..ed0dbf034539 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -53,6 +53,7 @@ #include #include #include +#include #include #include @@ -867,6 +868,99 @@ static inline void sk_refcnt_debug_release(const struct sock *sk) #define sk_refcnt_debug_release(sk) do { } while (0) #endif /* SOCK_REFCNT_DEBUG */ +static inline bool sk_has_memory_pressure(const struct sock *sk) +{ + return sk->sk_prot->memory_pressure != NULL; +} + +static inline bool sk_under_memory_pressure(const struct sock *sk) +{ + if (!sk->sk_prot->memory_pressure) + return false; + return !!*sk->sk_prot->memory_pressure; +} + +static inline void sk_leave_memory_pressure(struct sock *sk) +{ + int *memory_pressure = sk->sk_prot->memory_pressure; + + if (memory_pressure && *memory_pressure) + *memory_pressure = 0; +} + +static inline void sk_enter_memory_pressure(struct sock *sk) +{ + if (sk->sk_prot->enter_memory_pressure) + sk->sk_prot->enter_memory_pressure(sk); +} + +static inline long sk_prot_mem_limits(const struct sock *sk, int index) +{ + long *prot = sk->sk_prot->sysctl_mem; + return prot[index]; +} + +static inline long +sk_memory_allocated(const struct sock *sk) +{ + struct proto *prot = sk->sk_prot; + return atomic_long_read(prot->memory_allocated); +} + +static inline long +sk_memory_allocated_add(struct sock *sk, int amt) +{ + struct proto *prot = sk->sk_prot; + return atomic_long_add_return(amt, prot->memory_allocated); +} + +static inline void +sk_memory_allocated_sub(struct sock *sk, int amt) +{ + struct proto *prot = sk->sk_prot; + atomic_long_sub(amt, prot->memory_allocated); +} + +static inline void sk_sockets_allocated_dec(struct sock *sk) +{ + struct proto *prot = sk->sk_prot; + percpu_counter_dec(prot->sockets_allocated); +} + +static inline void sk_sockets_allocated_inc(struct sock *sk) +{ + struct proto *prot = sk->sk_prot; + percpu_counter_inc(prot->sockets_allocated); +} + +static inline int +sk_sockets_allocated_read_positive(struct sock *sk) +{ + struct proto *prot = sk->sk_prot; + + return percpu_counter_sum_positive(prot->sockets_allocated); +} + +static inline int +proto_sockets_allocated_sum_positive(struct proto *prot) +{ + return percpu_counter_sum_positive(prot->sockets_allocated); +} + +static inline long +proto_memory_allocated(struct proto *prot) +{ + return atomic_long_read(prot->memory_allocated); +} + +static inline bool +proto_memory_pressure(struct proto *prot) +{ + if (!prot->memory_pressure) + return false; + return !!*prot->memory_pressure; +} + #ifdef CONFIG_PROC_FS /* Called with local bh disabled */ @@ -1674,7 +1768,7 @@ static inline struct page *sk_stream_alloc_page(struct sock *sk) page = alloc_pages(sk->sk_allocation, 0); if (!page) { - sk->sk_prot->enter_memory_pressure(sk); + sk_enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); } return page; diff --git a/include/net/tcp.h b/include/net/tcp.h index 02f070d339ba..913473b4eda7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -44,6 +44,7 @@ #include #include +#include extern struct inet_hashinfo tcp_hashinfo; @@ -285,7 +286,7 @@ static inline bool tcp_too_many_orphans(struct sock *sk, int shift) } if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && - atomic_long_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) + sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2)) return true; return false; } diff --git a/net/core/sock.c b/net/core/sock.c index 9777da86aeac..a3d4205e7238 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1323,7 +1323,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_wq = NULL; if (newsk->sk_prot->sockets_allocated) - percpu_counter_inc(newsk->sk_prot->sockets_allocated); + sk_sockets_allocated_inc(newsk); if (newsk->sk_flags & SK_FLAGS_TIMESTAMP) net_enable_timestamp(); @@ -1713,28 +1713,28 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) long allocated; sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; - allocated = atomic_long_add_return(amt, prot->memory_allocated); + + allocated = sk_memory_allocated_add(sk, amt); /* Under limit. */ - if (allocated <= prot->sysctl_mem[0]) { - if (prot->memory_pressure && *prot->memory_pressure) - *prot->memory_pressure = 0; + if (allocated <= sk_prot_mem_limits(sk, 0)) { + sk_leave_memory_pressure(sk); return 1; } /* Under pressure. */ - if (allocated > prot->sysctl_mem[1]) - if (prot->enter_memory_pressure) - prot->enter_memory_pressure(sk); + if (allocated > sk_prot_mem_limits(sk, 1)) + sk_enter_memory_pressure(sk); /* Over hard limit. */ - if (allocated > prot->sysctl_mem[2]) + if (allocated > sk_prot_mem_limits(sk, 2)) goto suppress_allocation; /* guarantee minimum buffer size under pressure */ if (kind == SK_MEM_RECV) { if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0]) return 1; + } else { /* SK_MEM_SEND */ if (sk->sk_type == SOCK_STREAM) { if (sk->sk_wmem_queued < prot->sysctl_wmem[0]) @@ -1744,13 +1744,13 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) return 1; } - if (prot->memory_pressure) { + if (sk_has_memory_pressure(sk)) { int alloc; - if (!*prot->memory_pressure) + if (!sk_under_memory_pressure(sk)) return 1; - alloc = percpu_counter_read_positive(prot->sockets_allocated); - if (prot->sysctl_mem[2] > alloc * + alloc = sk_sockets_allocated_read_positive(sk); + if (sk_prot_mem_limits(sk, 2) > alloc * sk_mem_pages(sk->sk_wmem_queued + atomic_read(&sk->sk_rmem_alloc) + sk->sk_forward_alloc)) @@ -1773,7 +1773,9 @@ suppress_allocation: /* Alas. Undo changes. */ sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM; - atomic_long_sub(amt, prot->memory_allocated); + + sk_memory_allocated_sub(sk, amt); + return 0; } EXPORT_SYMBOL(__sk_mem_schedule); @@ -1784,15 +1786,13 @@ EXPORT_SYMBOL(__sk_mem_schedule); */ void __sk_mem_reclaim(struct sock *sk) { - struct proto *prot = sk->sk_prot; - - atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, - prot->memory_allocated); + sk_memory_allocated_sub(sk, + sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT); sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; - if (prot->memory_pressure && *prot->memory_pressure && - (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0])) - *prot->memory_pressure = 0; + if (sk_under_memory_pressure(sk) && + (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) + sk_leave_memory_pressure(sk); } EXPORT_SYMBOL(__sk_mem_reclaim); @@ -2507,16 +2507,27 @@ static char proto_method_implemented(const void *method) { return method == NULL ? 'n' : 'y'; } +static long sock_prot_memory_allocated(struct proto *proto) +{ + return proto->memory_allocated != NULL ? proto_memory_allocated(proto): -1L; +} + +static char *sock_prot_memory_pressure(struct proto *proto) +{ + return proto->memory_pressure != NULL ? + proto_memory_pressure(proto) ? "yes" : "no" : "NI"; +} static void proto_seq_printf(struct seq_file *seq, struct proto *proto) { + seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", proto->name, proto->obj_size, sock_prot_inuse_get(seq_file_net(seq), proto), - proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L, - proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI", + sock_prot_memory_allocated(proto), + sock_prot_memory_pressure(proto), proto->max_header, proto->slab == NULL ? "no" : "yes", module_name(proto->owner), diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 961eed4f510a..3569d8ecaeac 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -56,17 +56,17 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) local_bh_disable(); orphans = percpu_counter_sum_positive(&tcp_orphan_count); - sockets = percpu_counter_sum_positive(&tcp_sockets_allocated); + sockets = proto_sockets_allocated_sum_positive(&tcp_prot); local_bh_enable(); socket_seq_show(seq); seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", sock_prot_inuse_get(net, &tcp_prot), orphans, tcp_death_row.tw_count, sockets, - atomic_long_read(&tcp_memory_allocated)); + proto_memory_allocated(&tcp_prot)); seq_printf(seq, "UDP: inuse %d mem %ld\n", sock_prot_inuse_get(net, &udp_prot), - atomic_long_read(&udp_memory_allocated)); + proto_memory_allocated(&udp_prot)); seq_printf(seq, "UDPLITE: inuse %d\n", sock_prot_inuse_get(net, &udplite_prot)); seq_printf(seq, "RAW: inuse %d\n", diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b9cbc351c511..f131d92d25ee 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -322,7 +322,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) /* Check #1 */ if (tp->rcv_ssthresh < tp->window_clamp && (int)tp->rcv_ssthresh < tcp_space(sk) && - !tcp_memory_pressure) { + !sk_under_memory_pressure(sk)) { int incr; /* Check #2. Increase window, if skb with such overhead @@ -411,8 +411,8 @@ static void tcp_clamp_window(struct sock *sk) if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && - !tcp_memory_pressure && - atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { + !sk_under_memory_pressure(sk) && + sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), sysctl_tcp_rmem[2]); } @@ -4866,7 +4866,7 @@ static int tcp_prune_queue(struct sock *sk) if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) tcp_clamp_window(sk); - else if (tcp_memory_pressure) + else if (sk_under_memory_pressure(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); tcp_collapse_ofo_queue(sk); @@ -4932,11 +4932,11 @@ static int tcp_should_expand_sndbuf(const struct sock *sk) return 0; /* If we are under global TCP memory pressure, do not expand. */ - if (tcp_memory_pressure) + if (sk_under_memory_pressure(sk)) return 0; /* If we are under soft global TCP memory pressure, do not expand. */ - if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0]) + if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0)) return 0; /* If we filled the congestion window, do not expand. */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c4b8b09db9f5..f48bf312cfe8 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1917,7 +1917,7 @@ static int tcp_v4_init_sock(struct sock *sk) sk->sk_rcvbuf = sysctl_tcp_rmem[1]; local_bh_disable(); - percpu_counter_inc(&tcp_sockets_allocated); + sk_sockets_allocated_inc(sk); local_bh_enable(); return 0; @@ -1973,7 +1973,7 @@ void tcp_v4_destroy_sock(struct sock *sk) tp->cookie_values = NULL; } - percpu_counter_dec(&tcp_sockets_allocated); + sk_sockets_allocated_dec(sk); } EXPORT_SYMBOL(tcp_v4_destroy_sock); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index cf3068038942..8c8de2780c7a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1922,7 +1922,7 @@ u32 __tcp_select_window(struct sock *sk) if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; - if (tcp_memory_pressure) + if (sk_under_memory_pressure(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index aa39a692f4c8..40a41f077981 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -261,7 +261,7 @@ static void tcp_delack_timer(unsigned long data) } out: - if (tcp_memory_pressure) + if (sk_under_memory_pressure(sk)) sk_mem_reclaim(sk); out_unlock: bh_unlock_sock(sk); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 9d74eee334d6..b69c7030aba9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1994,7 +1994,7 @@ static int tcp_v6_init_sock(struct sock *sk) sk->sk_rcvbuf = sysctl_tcp_rmem[1]; local_bh_disable(); - percpu_counter_inc(&tcp_sockets_allocated); + sk_sockets_allocated_inc(sk); local_bh_enable(); return 0; -- cgit v1.2.3 From 974c12360dfe6ab01201fe9e708e7755c413f8b6 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 19 Jan 2012 14:42:21 +0000 Subject: tcp: detect loss above high_seq in recovery Correctly implement a loss detection heuristic: New sequences (above high_seq) sent during the fast recovery are deemed lost when higher sequences are SACKed. Current code does not catch these losses, because tcp_mark_head_lost() does not check packets beyond high_seq. The fix is straight-forward by checking packets until the highest sacked packet. In addition, all the FLAG_DATA_LOST logic are in-effective and redundant and can be removed. Update the loss heuristic comments. The algorithm above is documented as heuristic B, but it is redundant too because heuristic A already covers B. Note that this change only marks some forward-retransmitted packets LOST. It does NOT forbid TCP performing further CWR on new losses. A potential follow-up patch under preparation is to perform another CWR on "new" losses such as 1) sequence above high_seq is lost (by resetting high_seq to snd_nxt) 2) retransmission is lost. Signed-off-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/snmp.h | 1 - net/ipv4/proc.c | 1 - net/ipv4/tcp_input.c | 41 +++++++++++++++-------------------------- 3 files changed, 15 insertions(+), 28 deletions(-) (limited to 'net/ipv4/proc.c') diff --git a/include/linux/snmp.h b/include/linux/snmp.h index e16557a357e5..c1241c428179 100644 --- a/include/linux/snmp.h +++ b/include/linux/snmp.h @@ -192,7 +192,6 @@ enum LINUX_MIB_TCPPARTIALUNDO, /* TCPPartialUndo */ LINUX_MIB_TCPDSACKUNDO, /* TCPDSACKUndo */ LINUX_MIB_TCPLOSSUNDO, /* TCPLossUndo */ - LINUX_MIB_TCPLOSS, /* TCPLoss */ LINUX_MIB_TCPLOSTRETRANSMIT, /* TCPLostRetransmit */ LINUX_MIB_TCPRENOFAILURES, /* TCPRenoFailures */ LINUX_MIB_TCPSACKFAILURES, /* TCPSackFailures */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 3569d8ecaeac..6afc807ee2ad 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -216,7 +216,6 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO), SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO), SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO), - SNMP_MIB_ITEM("TCPLoss", LINUX_MIB_TCPLOSS), SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT), SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES), SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES), diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2877c3e09587..976034f82320 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -105,7 +105,6 @@ int sysctl_tcp_abc __read_mostly; #define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ #define FLAG_DATA_SACKED 0x20 /* New SACK. */ #define FLAG_ECE 0x40 /* ECE in this ACK */ -#define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. */ #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ #define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ @@ -1040,13 +1039,11 @@ static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, * These 6 states form finite state machine, controlled by the following events: * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue()) * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue()) - * 3. Loss detection event of one of three flavors: + * 3. Loss detection event of two flavors: * A. Scoreboard estimator decided the packet is lost. * A'. Reno "three dupacks" marks head of queue lost. - * A''. Its FACK modfication, head until snd.fack is lost. - * B. SACK arrives sacking data transmitted after never retransmitted - * hole was sent out. - * C. SACK arrives sacking SND.NXT at the moment, when the + * A''. Its FACK modification, head until snd.fack is lost. + * B. SACK arrives sacking SND.NXT at the moment, when the * segment was retransmitted. * 4. D-SACK added new rule: D-SACK changes any tag to S. * @@ -1153,7 +1150,7 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, } /* Check for lost retransmit. This superb idea is borrowed from "ratehalving". - * Event "C". Later note: FACK people cheated me again 8), we have to account + * Event "B". Later note: FACK people cheated me again 8), we have to account * for reordering! Ugly, but should help. * * Search retransmitted skbs from write_queue that were sent when snd_nxt was @@ -1844,10 +1841,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, if (found_dup_sack && ((i + 1) == first_sack_index)) next_dup = &sp[i + 1]; - /* Event "B" in the comment above. */ - if (after(end_seq, tp->high_seq)) - state.flag |= FLAG_DATA_LOST; - /* Skip too early cached blocks */ while (tcp_sack_cache_ok(tp, cache) && !before(start_seq, cache->end_seq)) @@ -2515,8 +2508,11 @@ static void tcp_timeout_skbs(struct sock *sk) tcp_verify_left_out(tp); } -/* Mark head of queue up as lost. With RFC3517 SACK, the packets is - * is against sacked "cnt", otherwise it's against facked "cnt" +/* Detect loss in event "A" above by marking head of queue up as lost. + * For FACK or non-SACK(Reno) senders, the first "packets" number of segments + * are considered lost. For RFC3517 SACK, a segment is considered lost if it + * has at least tp->reordering SACKed seqments above it; "packets" refers to + * the maximum SACKed segments to pass before reaching this limit. */ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) { @@ -2525,6 +2521,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) int cnt, oldcnt; int err; unsigned int mss; + /* Use SACK to deduce losses of new sequences sent during recovery */ + const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq; WARN_ON(packets > tp->packets_out); if (tp->lost_skb_hint) { @@ -2546,7 +2544,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) tp->lost_skb_hint = skb; tp->lost_cnt_hint = cnt; - if (after(TCP_SKB_CB(skb)->end_seq, tp->high_seq)) + if (after(TCP_SKB_CB(skb)->end_seq, loss_high)) break; oldcnt = cnt; @@ -3033,19 +3031,10 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, if (tcp_check_sack_reneging(sk, flag)) return; - /* C. Process data loss notification, provided it is valid. */ - if (tcp_is_fack(tp) && (flag & FLAG_DATA_LOST) && - before(tp->snd_una, tp->high_seq) && - icsk->icsk_ca_state != TCP_CA_Open && - tp->fackets_out > tp->reordering) { - tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS); - } - - /* D. Check consistency of the current state. */ + /* C. Check consistency of the current state. */ tcp_verify_left_out(tp); - /* E. Check state exit conditions. State can be terminated + /* D. Check state exit conditions. State can be terminated * when high_seq is ACKed. */ if (icsk->icsk_ca_state == TCP_CA_Open) { WARN_ON(tp->retrans_out != 0); @@ -3077,7 +3066,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, } } - /* F. Process state. */ + /* E. Process state. */ switch (icsk->icsk_ca_state) { case TCP_CA_Recovery: if (!(flag & FLAG_SND_UNA_ADVANCED)) { -- cgit v1.2.3