From 9de7922bc709eee2f609cd01d98aaedc4cf5ea74 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 9 Oct 2014 22:55:31 +0200 Subject: net: sctp: fix skb_over_panic when receiving malformed ASCONF chunks Commit 6f4c618ddb0 ("SCTP : Add paramters validity check for ASCONF chunk") added basic verification of ASCONF chunks, however, it is still possible to remotely crash a server by sending a special crafted ASCONF chunk, even up to pre 2.6.12 kernels: skb_over_panic: text:ffffffffa01ea1c3 len:31056 put:30768 head:ffff88011bd81800 data:ffff88011bd81800 tail:0x7950 end:0x440 dev: ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:129! [...] Call Trace: [] skb_put+0x5c/0x70 [] sctp_addto_chunk+0x63/0xd0 [sctp] [] sctp_process_asconf+0x1af/0x540 [sctp] [] ? _read_unlock_bh+0x15/0x20 [] sctp_sf_do_asconf+0x168/0x240 [sctp] [] sctp_do_sm+0x71/0x1210 [sctp] [] ? fib_rules_lookup+0xad/0xf0 [] ? sctp_cmp_addr_exact+0x32/0x40 [sctp] [] sctp_assoc_bh_rcv+0xd3/0x180 [sctp] [] sctp_inq_push+0x56/0x80 [sctp] [] sctp_rcv+0x982/0xa10 [sctp] [] ? ipt_local_in_hook+0x23/0x28 [iptable_filter] [] ? nf_iterate+0x69/0xb0 [] ? ip_local_deliver_finish+0x0/0x2d0 [] ? nf_hook_slow+0x76/0x120 [] ? ip_local_deliver_finish+0x0/0x2d0 [] ip_local_deliver_finish+0xdd/0x2d0 [] ip_local_deliver+0x98/0xa0 [] ip_rcv_finish+0x12d/0x440 [] ip_rcv+0x275/0x350 [] __netif_receive_skb+0x4ab/0x750 [] netif_receive_skb+0x58/0x60 This can be triggered e.g., through a simple scripted nmap connection scan injecting the chunk after the handshake, for example, ... -------------- INIT[ASCONF; ASCONF_ACK] -------------> <----------- INIT-ACK[ASCONF; ASCONF_ACK] ------------ -------------------- COOKIE-ECHO --------------------> <-------------------- COOKIE-ACK --------------------- ------------------ ASCONF; UNKNOWN ------------------> ... where ASCONF chunk of length 280 contains 2 parameters ... 1) Add IP address parameter (param length: 16) 2) Add/del IP address parameter (param length: 255) ... followed by an UNKNOWN chunk of e.g. 4 bytes. Here, the Address Parameter in the ASCONF chunk is even missing, too. This is just an example and similarly-crafted ASCONF chunks could be used just as well. The ASCONF chunk passes through sctp_verify_asconf() as all parameters passed sanity checks, and after walking, we ended up successfully at the chunk end boundary, and thus may invoke sctp_process_asconf(). Parameter walking is done with WORD_ROUND() to take padding into account. In sctp_process_asconf()'s TLV processing, we may fail in sctp_process_asconf_param() e.g., due to removal of the IP address that is also the source address of the packet containing the ASCONF chunk, and thus we need to add all TLVs after the failure to our ASCONF response to remote via helper function sctp_add_asconf_response(), which basically invokes a sctp_addto_chunk() adding the error parameters to the given skb. When walking to the next parameter this time, we proceed with ... length = ntohs(asconf_param->param_hdr.length); asconf_param = (void *)asconf_param + length; ... instead of the WORD_ROUND()'ed length, thus resulting here in an off-by-one that leads to reading the follow-up garbage parameter length of 12336, and thus throwing an skb_over_panic for the reply when trying to sctp_addto_chunk() next time, which implicitly calls the skb_put() with that length. Fix it by using sctp_walk_params() [ which is also used in INIT parameter processing ] macro in the verification *and* in ASCONF processing: it will make sure we don't spill over, that we walk parameters WORD_ROUND()'ed. Moreover, we're being more defensive and guard against unknown parameter types and missized addresses. Joint work with Vlad Yasevich. Fixes: b896b82be4ae ("[SCTP] ADDIP: Support for processing incoming ASCONF_ACK chunks.") Signed-off-by: Daniel Borkmann Signed-off-by: Vlad Yasevich Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/sm_make_chunk.c | 99 +++++++++++++++++++++++++++--------------------- net/sctp/sm_statefuns.c | 18 +-------- 2 files changed, 57 insertions(+), 60 deletions(-) (limited to 'net') diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index ae0e616a7ca5..ab734be8cb20 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -3110,50 +3110,63 @@ static __be16 sctp_process_asconf_param(struct sctp_association *asoc, return SCTP_ERROR_NO_ERROR; } -/* Verify the ASCONF packet before we process it. */ -int sctp_verify_asconf(const struct sctp_association *asoc, - struct sctp_paramhdr *param_hdr, void *chunk_end, - struct sctp_paramhdr **errp) { - sctp_addip_param_t *asconf_param; +/* Verify the ASCONF packet before we process it. */ +bool sctp_verify_asconf(const struct sctp_association *asoc, + struct sctp_chunk *chunk, bool addr_param_needed, + struct sctp_paramhdr **errp) +{ + sctp_addip_chunk_t *addip = (sctp_addip_chunk_t *) chunk->chunk_hdr; union sctp_params param; - int length, plen; - - param.v = (sctp_paramhdr_t *) param_hdr; - while (param.v <= chunk_end - sizeof(sctp_paramhdr_t)) { - length = ntohs(param.p->length); - *errp = param.p; + bool addr_param_seen = false; - if (param.v > chunk_end - length || - length < sizeof(sctp_paramhdr_t)) - return 0; + sctp_walk_params(param, addip, addip_hdr.params) { + size_t length = ntohs(param.p->length); + *errp = param.p; switch (param.p->type) { + case SCTP_PARAM_ERR_CAUSE: + break; + case SCTP_PARAM_IPV4_ADDRESS: + if (length != sizeof(sctp_ipv4addr_param_t)) + return false; + addr_param_seen = true; + break; + case SCTP_PARAM_IPV6_ADDRESS: + if (length != sizeof(sctp_ipv6addr_param_t)) + return false; + addr_param_seen = true; + break; case SCTP_PARAM_ADD_IP: case SCTP_PARAM_DEL_IP: case SCTP_PARAM_SET_PRIMARY: - asconf_param = (sctp_addip_param_t *)param.v; - plen = ntohs(asconf_param->param_hdr.length); - if (plen < sizeof(sctp_addip_param_t) + - sizeof(sctp_paramhdr_t)) - return 0; + /* In ASCONF chunks, these need to be first. */ + if (addr_param_needed && !addr_param_seen) + return false; + length = ntohs(param.addip->param_hdr.length); + if (length < sizeof(sctp_addip_param_t) + + sizeof(sctp_paramhdr_t)) + return false; break; case SCTP_PARAM_SUCCESS_REPORT: case SCTP_PARAM_ADAPTATION_LAYER_IND: if (length != sizeof(sctp_addip_param_t)) - return 0; - + return false; break; default: - break; + /* This is unkown to us, reject! */ + return false; } - - param.v += WORD_ROUND(length); } - if (param.v != chunk_end) - return 0; + /* Remaining sanity checks. */ + if (addr_param_needed && !addr_param_seen) + return false; + if (!addr_param_needed && addr_param_seen) + return false; + if (param.v != chunk->chunk_end) + return false; - return 1; + return true; } /* Process an incoming ASCONF chunk with the next expected serial no. and @@ -3162,16 +3175,17 @@ int sctp_verify_asconf(const struct sctp_association *asoc, struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc, struct sctp_chunk *asconf) { + sctp_addip_chunk_t *addip = (sctp_addip_chunk_t *) asconf->chunk_hdr; + bool all_param_pass = true; + union sctp_params param; sctp_addiphdr_t *hdr; union sctp_addr_param *addr_param; sctp_addip_param_t *asconf_param; struct sctp_chunk *asconf_ack; - __be16 err_code; int length = 0; int chunk_len; __u32 serial; - int all_param_pass = 1; chunk_len = ntohs(asconf->chunk_hdr->length) - sizeof(sctp_chunkhdr_t); hdr = (sctp_addiphdr_t *)asconf->skb->data; @@ -3199,9 +3213,14 @@ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc, goto done; /* Process the TLVs contained within the ASCONF chunk. */ - while (chunk_len > 0) { + sctp_walk_params(param, addip, addip_hdr.params) { + /* Skip preceeding address parameters. */ + if (param.p->type == SCTP_PARAM_IPV4_ADDRESS || + param.p->type == SCTP_PARAM_IPV6_ADDRESS) + continue; + err_code = sctp_process_asconf_param(asoc, asconf, - asconf_param); + param.addip); /* ADDIP 4.1 A7) * If an error response is received for a TLV parameter, * all TLVs with no response before the failed TLV are @@ -3209,28 +3228,20 @@ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc, * the failed response are considered unsuccessful unless * a specific success indication is present for the parameter. */ - if (SCTP_ERROR_NO_ERROR != err_code) - all_param_pass = 0; - + if (err_code != SCTP_ERROR_NO_ERROR) + all_param_pass = false; if (!all_param_pass) - sctp_add_asconf_response(asconf_ack, - asconf_param->crr_id, err_code, - asconf_param); + sctp_add_asconf_response(asconf_ack, param.addip->crr_id, + err_code, param.addip); /* ADDIP 4.3 D11) When an endpoint receiving an ASCONF to add * an IP address sends an 'Out of Resource' in its response, it * MUST also fail any subsequent add or delete requests bundled * in the ASCONF. */ - if (SCTP_ERROR_RSRC_LOW == err_code) + if (err_code == SCTP_ERROR_RSRC_LOW) goto done; - - /* Move to the next ASCONF param. */ - length = ntohs(asconf_param->param_hdr.length); - asconf_param = (void *)asconf_param + length; - chunk_len -= length; } - done: asoc->peer.addip_serial++; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index c8f606324134..bdea3dfbad31 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -3591,9 +3591,7 @@ sctp_disposition_t sctp_sf_do_asconf(struct net *net, struct sctp_chunk *asconf_ack = NULL; struct sctp_paramhdr *err_param = NULL; sctp_addiphdr_t *hdr; - union sctp_addr_param *addr_param; __u32 serial; - int length; if (!sctp_vtag_verify(chunk, asoc)) { sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG, @@ -3618,17 +3616,8 @@ sctp_disposition_t sctp_sf_do_asconf(struct net *net, hdr = (sctp_addiphdr_t *)chunk->skb->data; serial = ntohl(hdr->serial); - addr_param = (union sctp_addr_param *)hdr->params; - length = ntohs(addr_param->p.length); - if (length < sizeof(sctp_paramhdr_t)) - return sctp_sf_violation_paramlen(net, ep, asoc, type, arg, - (void *)addr_param, commands); - /* Verify the ASCONF chunk before processing it. */ - if (!sctp_verify_asconf(asoc, - (sctp_paramhdr_t *)((void *)addr_param + length), - (void *)chunk->chunk_end, - &err_param)) + if (!sctp_verify_asconf(asoc, chunk, true, &err_param)) return sctp_sf_violation_paramlen(net, ep, asoc, type, arg, (void *)err_param, commands); @@ -3745,10 +3734,7 @@ sctp_disposition_t sctp_sf_do_asconf_ack(struct net *net, rcvd_serial = ntohl(addip_hdr->serial); /* Verify the ASCONF-ACK chunk before processing it. */ - if (!sctp_verify_asconf(asoc, - (sctp_paramhdr_t *)addip_hdr->params, - (void *)asconf_ack->chunk_end, - &err_param)) + if (!sctp_verify_asconf(asoc, asconf_ack, false, &err_param)) return sctp_sf_violation_paramlen(net, ep, asoc, type, arg, (void *)err_param, commands); -- cgit v1.2.3 From b69040d8e39f20d5215a03502a8e8b4c6ab78395 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 9 Oct 2014 22:55:32 +0200 Subject: net: sctp: fix panic on duplicate ASCONF chunks When receiving a e.g. semi-good formed connection scan in the form of ... -------------- INIT[ASCONF; ASCONF_ACK] -------------> <----------- INIT-ACK[ASCONF; ASCONF_ACK] ------------ -------------------- COOKIE-ECHO --------------------> <-------------------- COOKIE-ACK --------------------- ---------------- ASCONF_a; ASCONF_b -----------------> ... where ASCONF_a equals ASCONF_b chunk (at least both serials need to be equal), we panic an SCTP server! The problem is that good-formed ASCONF chunks that we reply with ASCONF_ACK chunks are cached per serial. Thus, when we receive a same ASCONF chunk twice (e.g. through a lost ASCONF_ACK), we do not need to process them again on the server side (that was the idea, also proposed in the RFC). Instead, we know it was cached and we just resend the cached chunk instead. So far, so good. Where things get nasty is in SCTP's side effect interpreter, that is, sctp_cmd_interpreter(): While incoming ASCONF_a (chunk = event_arg) is being marked !end_of_packet and !singleton, and we have an association context, we do not flush the outqueue the first time after processing the ASCONF_ACK singleton chunk via SCTP_CMD_REPLY. Instead, we keep it queued up, although we set local_cork to 1. Commit 2e3216cd54b1 changed the precedence, so that as long as we get bundled, incoming chunks we try possible bundling on outgoing queue as well. Before this commit, we would just flush the output queue. Now, while ASCONF_a's ASCONF_ACK sits in the corked outq, we continue to process the same ASCONF_b chunk from the packet. As we have cached the previous ASCONF_ACK, we find it, grab it and do another SCTP_CMD_REPLY command on it. So, effectively, we rip the chunk->list pointers and requeue the same ASCONF_ACK chunk another time. Since we process ASCONF_b, it's correctly marked with end_of_packet and we enforce an uncork, and thus flush, thus crashing the kernel. Fix it by testing if the ASCONF_ACK is currently pending and if that is the case, do not requeue it. When flushing the output queue we may relink the chunk for preparing an outgoing packet, but eventually unlink it when it's copied into the skb right before transmission. Joint work with Vlad Yasevich. Fixes: 2e3216cd54b1 ("sctp: Follow security requirement of responding with 1 packet") Signed-off-by: Daniel Borkmann Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 5 +++++ net/sctp/associola.c | 2 ++ 2 files changed, 7 insertions(+) (limited to 'net') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 9fbd856e6713..856f01cb51dd 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -426,6 +426,11 @@ static inline void sctp_assoc_pending_pmtu(struct sock *sk, struct sctp_associat asoc->pmtu_pending = 0; } +static inline bool sctp_chunk_pending(const struct sctp_chunk *chunk) +{ + return !list_empty(&chunk->list); +} + /* Walk through a list of TLV parameters. Don't trust the * individual parameter lengths and instead depend on * the chunk length to indicate when to stop. Make sure diff --git a/net/sctp/associola.c b/net/sctp/associola.c index a88b8524846e..f791edd64d6c 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1668,6 +1668,8 @@ struct sctp_chunk *sctp_assoc_lookup_asconf_ack( * ack chunk whose serial number matches that of the request. */ list_for_each_entry(ack, &asoc->asconf_ack_list, transmitted_list) { + if (sctp_chunk_pending(ack)) + continue; if (ack->subh.addip_hdr->serial == serial) { sctp_chunk_hold(ack); return ack; -- cgit v1.2.3 From 26b87c7881006311828bb0ab271a551a62dcceb4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 9 Oct 2014 22:55:33 +0200 Subject: net: sctp: fix remote memory pressure from excessive queueing This scenario is not limited to ASCONF, just taken as one example triggering the issue. When receiving ASCONF probes in the form of ... -------------- INIT[ASCONF; ASCONF_ACK] -------------> <----------- INIT-ACK[ASCONF; ASCONF_ACK] ------------ -------------------- COOKIE-ECHO --------------------> <-------------------- COOKIE-ACK --------------------- ---- ASCONF_a; [ASCONF_b; ...; ASCONF_n;] JUNK ------> [...] ---- ASCONF_m; [ASCONF_o; ...; ASCONF_z;] JUNK ------> ... where ASCONF_a, ASCONF_b, ..., ASCONF_z are good-formed ASCONFs and have increasing serial numbers, we process such ASCONF chunk(s) marked with !end_of_packet and !singleton, since we have not yet reached the SCTP packet end. SCTP does only do verification on a chunk by chunk basis, as an SCTP packet is nothing more than just a container of a stream of chunks which it eats up one by one. We could run into the case that we receive a packet with a malformed tail, above marked as trailing JUNK. All previous chunks are here goodformed, so the stack will eat up all previous chunks up to this point. In case JUNK does not fit into a chunk header and there are no more other chunks in the input queue, or in case JUNK contains a garbage chunk header, but the encoded chunk length would exceed the skb tail, or we came here from an entirely different scenario and the chunk has pdiscard=1 mark (without having had a flush point), it will happen, that we will excessively queue up the association's output queue (a correct final chunk may then turn it into a response flood when flushing the queue ;)): I ran a simple script with incremental ASCONF serial numbers and could see the server side consuming excessive amount of RAM [before/after: up to 2GB and more]. The issue at heart is that the chunk train basically ends with !end_of_packet and !singleton markers and since commit 2e3216cd54b1 ("sctp: Follow security requirement of responding with 1 packet") therefore preventing an output queue flush point in sctp_do_sm() -> sctp_cmd_interpreter() on the input chunk (chunk = event_arg) even though local_cork is set, but its precedence has changed since then. In the normal case, the last chunk with end_of_packet=1 would trigger the queue flush to accommodate possible outgoing bundling. In the input queue, sctp_inq_pop() seems to do the right thing in terms of discarding invalid chunks. So, above JUNK will not enter the state machine and instead be released and exit the sctp_assoc_bh_rcv() chunk processing loop. It's simply the flush point being missing at loop exit. Adding a try-flush approach on the output queue might not work as the underlying infrastructure might be long gone at this point due to the side-effect interpreter run. One possibility, albeit a bit of a kludge, would be to defer invalid chunk freeing into the state machine in order to possibly trigger packet discards and thus indirectly a queue flush on error. It would surely be better to discard chunks as in the current, perhaps better controlled environment, but going back and forth, it's simply architecturally not possible. I tried various trailing JUNK attack cases and it seems to look good now. Joint work with Vlad Yasevich. Fixes: 2e3216cd54b1 ("sctp: Follow security requirement of responding with 1 packet") Signed-off-by: Daniel Borkmann Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/inqueue.c | 33 +++++++-------------------------- net/sctp/sm_statefuns.c | 3 +++ 2 files changed, 10 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index 4de12afa13d4..7e8a16c77039 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -140,18 +140,9 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue) } else { /* Nothing to do. Next chunk in the packet, please. */ ch = (sctp_chunkhdr_t *) chunk->chunk_end; - /* Force chunk->skb->data to chunk->chunk_end. */ - skb_pull(chunk->skb, - chunk->chunk_end - chunk->skb->data); - - /* Verify that we have at least chunk headers - * worth of buffer left. - */ - if (skb_headlen(chunk->skb) < sizeof(sctp_chunkhdr_t)) { - sctp_chunk_free(chunk); - chunk = queue->in_progress = NULL; - } + skb_pull(chunk->skb, chunk->chunk_end - chunk->skb->data); + /* We are guaranteed to pull a SCTP header. */ } } @@ -187,24 +178,14 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue) skb_pull(chunk->skb, sizeof(sctp_chunkhdr_t)); chunk->subh.v = NULL; /* Subheader is no longer valid. */ - if (chunk->chunk_end < skb_tail_pointer(chunk->skb)) { + if (chunk->chunk_end + sizeof(sctp_chunkhdr_t) < + skb_tail_pointer(chunk->skb)) { /* This is not a singleton */ chunk->singleton = 0; } else if (chunk->chunk_end > skb_tail_pointer(chunk->skb)) { - /* RFC 2960, Section 6.10 Bundling - * - * Partial chunks MUST NOT be placed in an SCTP packet. - * If the receiver detects a partial chunk, it MUST drop - * the chunk. - * - * Since the end of the chunk is past the end of our buffer - * (which contains the whole packet, we can freely discard - * the whole packet. - */ - sctp_chunk_free(chunk); - chunk = queue->in_progress = NULL; - - return NULL; + /* Discard inside state machine. */ + chunk->pdiscard = 1; + chunk->chunk_end = skb_tail_pointer(chunk->skb); } else { /* We are at the end of the packet, so mark the chunk * in case we need to send a SACK. diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index bdea3dfbad31..3ee27b7704ff 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -170,6 +170,9 @@ sctp_chunk_length_valid(struct sctp_chunk *chunk, { __u16 chunk_length = ntohs(chunk->chunk_hdr->length); + /* Previously already marked? */ + if (unlikely(chunk->pdiscard)) + return 0; if (unlikely(chunk_length < required_length)) return 0; -- cgit v1.2.3 From b2532eb9abd88384aa586169b54a3e53574f29f8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 10 Oct 2014 18:06:35 -0700 Subject: tcp: fix ooo_okay setting vs Small Queues TCP Small Queues (tcp_tsq_handler()) can hold one reference on sk->sk_wmem_alloc, preventing skb->ooo_okay being set. We should relax test done to set skb->ooo_okay to take care of this extra reference. Minimal truesize of skb containing one byte of payload is SKB_TRUESIZE(1) Without this fix, we have more chance locking flows into the wrong transmit queue. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8d4eac793700..0a5d97c20aa9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -914,9 +914,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcp_ca_event(sk, CA_EVENT_TX_START); /* if no packet is in qdisc/device queue, then allow XPS to select - * another queue. + * another queue. We can be called from tcp_tsq_handler() + * which holds one reference to sk_wmem_alloc. + * + * TODO: Ideally, in-flight pure ACK packets should not matter here. + * One way to get this would be to set skb->truesize = 2 on them. */ - skb->ooo_okay = sk_wmem_alloc_get(sk) == 0; + skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); -- cgit v1.2.3 From 02ea80741a25435123e8a5ca40cac6a0bcf0c9f1 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Sat, 11 Oct 2014 13:03:34 +0800 Subject: ipv6: remove aca_lock spinlock from struct ifacaddr6 no user uses this lock. Signed-off-by: Li RongQing Signed-off-by: David S. Miller --- include/net/if_inet6.h | 1 - net/ipv6/anycast.c | 1 - 2 files changed, 2 deletions(-) (limited to 'net') diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index 55a8d4056cc9..98e5f9578f86 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -146,7 +146,6 @@ struct ifacaddr6 { struct ifacaddr6 *aca_next; int aca_users; atomic_t aca_refcnt; - spinlock_t aca_lock; unsigned long aca_cstamp; unsigned long aca_tstamp; }; diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index f5e319a8d4e2..baf2742d1ec4 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -235,7 +235,6 @@ static struct ifacaddr6 *aca_alloc(struct rt6_info *rt, /* aca_tstamp should be updated upon changes */ aca->aca_cstamp = aca->aca_tstamp = jiffies; atomic_set(&aca->aca_refcnt, 1); - spin_lock_init(&aca->aca_lock); return aca; } -- cgit v1.2.3 From ad971f616aa98ea2503f1a1064637bfb4ef7b21e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 11 Oct 2014 15:17:29 -0700 Subject: tcp: fix tcp_ack() performance problem We worked hard to improve tcp_ack() performance, by not accessing skb_shinfo() in fast path (cd7d8498c9a5 tcp: change tcp_skb_pcount() location) We still have one spurious access because of ACK timestamping, added in commit e1c8a607b281 ("net-timestamp: ACK timestamp for bytestreams") By checking if sk_tsflags has SOF_TIMESTAMPING_TX_ACK set, we can avoid two cache line misses for the common case. While we are at it, add two prefetchw() : One in tcp_ack() to bring skb at the head of write queue. One in tcp_clean_rtx_queue() loop to bring following skb, as we will delete skb from the write queue and dirty skb->next->prev. Add a couple of [un]likely() clauses. After this patch, tcp_ack() is no longer the most consuming function in tcp stack. Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: Neal Cardwell Cc: Yuchung Cheng Cc: Van Jacobson Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 00a41499d52c..a12b455928e5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -68,6 +68,7 @@ #include #include #include +#include #include #include #include @@ -3029,6 +3030,21 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) return packets_acked; } +static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, + u32 prior_snd_una) +{ + const struct skb_shared_info *shinfo; + + /* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */ + if (likely(!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK))) + return; + + shinfo = skb_shinfo(skb); + if ((shinfo->tx_flags & SKBTX_ACK_TSTAMP) && + between(shinfo->tskey, prior_snd_una, tcp_sk(sk)->snd_una - 1)) + __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); +} + /* Remove acknowledged frames from the retransmission queue. If our packet * is before the ack sequence we can discard it as it's confirmed to have * arrived at the other end. @@ -3052,14 +3068,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, first_ackt.v64 = 0; while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { - struct skb_shared_info *shinfo = skb_shinfo(skb); struct tcp_skb_cb *scb = TCP_SKB_CB(skb); u8 sacked = scb->sacked; u32 acked_pcount; - if (unlikely(shinfo->tx_flags & SKBTX_ACK_TSTAMP) && - between(shinfo->tskey, prior_snd_una, tp->snd_una - 1)) - __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + tcp_ack_tstamp(sk, skb, prior_snd_una); /* Determine how many packets and what bytes were acked, tso and else */ if (after(scb->end_seq, tp->snd_una)) { @@ -3073,10 +3086,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, fully_acked = false; } else { + /* Speedup tcp_unlink_write_queue() and next loop */ + prefetchw(skb->next); acked_pcount = tcp_skb_pcount(skb); } - if (sacked & TCPCB_RETRANS) { + if (unlikely(sacked & TCPCB_RETRANS)) { if (sacked & TCPCB_SACKED_RETRANS) tp->retrans_out -= acked_pcount; flag |= FLAG_RETRANS_DATA_ACKED; @@ -3107,7 +3122,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, * connection startup slow start one packet too * quickly. This is severely frowned upon behavior. */ - if (!(scb->tcp_flags & TCPHDR_SYN)) { + if (likely(!(scb->tcp_flags & TCPHDR_SYN))) { flag |= FLAG_DATA_ACKED; } else { flag |= FLAG_SYN_ACKED; @@ -3119,9 +3134,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); - if (skb == tp->retransmit_skb_hint) + if (unlikely(skb == tp->retransmit_skb_hint)) tp->retransmit_skb_hint = NULL; - if (skb == tp->lost_skb_hint) + if (unlikely(skb == tp->lost_skb_hint)) tp->lost_skb_hint = NULL; } @@ -3132,7 +3147,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, flag |= FLAG_SACK_RENEGING; skb_mstamp_get(&now); - if (first_ackt.v64) { + if (likely(first_ackt.v64)) { seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); } @@ -3394,6 +3409,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) int acked = 0; /* Number of packets newly acked */ long sack_rtt_us = -1L; + /* We very likely will need to access write queue head. */ + prefetchw(sk->sk_write_queue.next); + /* If the ack is older than previous acks * then we can probably ignore it. */ -- cgit v1.2.3 From f76936d07c4eeb36d8dbb64ebd30ab46ff85d9f7 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 13 Oct 2014 16:34:10 +0200 Subject: ipv4: fix nexthop attlen check in fib_nh_match fib_nh_match does not match nexthops correctly. Example: ip route add 172.16.10/24 nexthop via 192.168.122.12 dev eth0 \ nexthop via 192.168.122.13 dev eth0 ip route del 172.16.10/24 nexthop via 192.168.122.14 dev eth0 \ nexthop via 192.168.122.15 dev eth0 Del command is successful and route is removed. After this patch applied, the route is correctly matched and result is: RTNETLINK answers: No such process Please consider this for stable trees as well. Fixes: 4e902c57417c4 ("[IPv4]: FIB configuration using struct fib_config") Signed-off-by: Jiri Pirko Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 5b6efb3d2308..f99f41bd15b8 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -537,7 +537,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) return 1; attrlen = rtnh_attrlen(rtnh); - if (attrlen < 0) { + if (attrlen > 0) { struct nlattr *nla, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); -- cgit v1.2.3 From 6ff1e1e3c81426515e1782f2f13b7237211a43df Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 13 Oct 2014 22:21:46 +0200 Subject: caif: replace kmalloc/memset 0 by kzalloc Also add blank line after declaration Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/caif/cfmuxl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/caif/cfmuxl.c b/net/caif/cfmuxl.c index 8c5d6386319f..510aa5a753f0 100644 --- a/net/caif/cfmuxl.c +++ b/net/caif/cfmuxl.c @@ -47,10 +47,10 @@ static struct cflayer *get_up(struct cfmuxl *muxl, u16 id); struct cflayer *cfmuxl_create(void) { - struct cfmuxl *this = kmalloc(sizeof(struct cfmuxl), GFP_ATOMIC); + struct cfmuxl *this = kzalloc(sizeof(struct cfmuxl), GFP_ATOMIC); + if (!this) return NULL; - memset(this, 0, sizeof(*this)); this->layer.receive = cfmuxl_receive; this->layer.transmit = cfmuxl_transmit; this->layer.ctrlcmd = cfmuxl_ctrlcmd; -- cgit v1.2.3 From 7970f1918ff685e64063b54474a9c1ac087aee4d Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 14 Oct 2014 19:00:55 +0200 Subject: caif_usb: remove redundant memory message Let MM subsystem display out of memory messages. Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/caif/caif_usb.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/caif/caif_usb.c b/net/caif/caif_usb.c index ba02db022900..0e487b07cf96 100644 --- a/net/caif/caif_usb.c +++ b/net/caif/caif_usb.c @@ -87,10 +87,9 @@ static struct cflayer *cfusbl_create(int phyid, u8 ethaddr[ETH_ALEN], { struct cfusbl *this = kmalloc(sizeof(struct cfusbl), GFP_ATOMIC); - if (!this) { - pr_warn("Out of memory\n"); + if (!this) return NULL; - } + caif_assert(offsetof(struct cfusbl, layer) == 0); memset(this, 0, sizeof(struct cflayer)); -- cgit v1.2.3 From 91c4467e3c76b6d40ecc29ed71d3aa1e0285ab80 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 14 Oct 2014 19:01:14 +0200 Subject: caif_usb: use target structure member in memset parent cfusbl was used instead of first structure member 'layer' Suggested-by: Joe Perches Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/caif/caif_usb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/caif/caif_usb.c b/net/caif/caif_usb.c index 0e487b07cf96..5cd44f001f64 100644 --- a/net/caif/caif_usb.c +++ b/net/caif/caif_usb.c @@ -92,7 +92,7 @@ static struct cflayer *cfusbl_create(int phyid, u8 ethaddr[ETH_ALEN], caif_assert(offsetof(struct cfusbl, layer) == 0); - memset(this, 0, sizeof(struct cflayer)); + memset(&this->layer, 0, sizeof(this->layer)); this->layer.receive = cfusbl_receive; this->layer.transmit = cfusbl_transmit; this->layer.ctrlcmd = cfusbl_ctrlcmd; -- cgit v1.2.3 From dee49f203a7feef5d00c416b7dc7e34a7caba8e1 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 14 Oct 2014 12:35:08 -0700 Subject: rds: avoid calling sock_kfree_s() on allocation failure It is okay to free a NULL pointer but not okay to mischarge the socket optmem accounting. Compile test only. Reported-by: rucsoftsec@gmail.com Cc: Chien Yen Cc: Stephen Hemminger Signed-off-by: Cong Wang Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/rds/rdma.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 4e37c1cbe8b2..40084d843e9f 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -564,12 +564,12 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, if (rs->rs_bound_addr == 0) { ret = -ENOTCONN; /* XXX not a great errno */ - goto out; + goto out_ret; } if (args->nr_local > UIO_MAXIOV) { ret = -EMSGSIZE; - goto out; + goto out_ret; } /* Check whether to allocate the iovec area */ @@ -578,7 +578,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL); if (!iovs) { ret = -ENOMEM; - goto out; + goto out_ret; } } @@ -696,6 +696,7 @@ out: if (iovs != iovstack) sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size); kfree(pages); +out_ret: if (ret) rds_rdma_free_op(op); else -- cgit v1.2.3 From e53da5fbfc02586fe4506ed583069b8205f3e38d Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 14 Oct 2014 17:02:37 -0400 Subject: net: Trap attempts to call sock_kfree_s() with a NULL pointer. Unlike normal kfree() it is never right to call sock_kfree_s() with a NULL pointer, because sock_kfree_s() also has the side effect of discharging the memory from the sockets quota. Signed-off-by: David S. Miller --- net/core/sock.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index b4f3ea2fce60..15e0c67b1069 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1718,6 +1718,8 @@ EXPORT_SYMBOL(sock_kmalloc); */ void sock_kfree_s(struct sock *sk, void *mem, int size) { + if (WARN_ON_ONCE(!mem)) + return; kfree(mem); atomic_sub(size, &sk->sk_omem_alloc); } -- cgit v1.2.3 From 9b462d02d6dd671a9ebdc45caed6fe98a53c0ebe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Oct 2014 06:27:47 -0700 Subject: tcp: TCP Small Queues and strange attractors TCP Small queues tries to keep number of packets in qdisc as small as possible, and depends on a tasklet to feed following packets at TX completion time. Choice of tasklet was driven by latencies requirements. Then, TCP stack tries to avoid reorders, by locking flows with outstanding packets in qdisc in a given TX queue. What can happen is that many flows get attracted by a low performing TX queue, and cpu servicing TX completion has to feed packets for all of them, making this cpu 100% busy in softirq mode. This became particularly visible with latest skb->xmit_more support Strategy adopted in this patch is to detect when tcp_wfree() is called from ksoftirqd and let the outstanding queue for this flow being drained before feeding additional packets, so that skb->ooo_okay can be set to allow select_queue() to select the optimal queue : Incoming ACKS are normally handled by different cpus, so this patch gives more chance for these cpus to take over the burden of feeding qdisc with future packets. Tested: lpaa23:~# ./super_netperf 1400 --google-pacing-rate 3028000 -H lpaa24 -l 3600 & lpaa23:~# sar -n DEV 1 10 | grep eth1 06:16:18 AM eth1 595448.00 1190564.00 38381.09 1760253.12 0.00 0.00 1.00 06:16:19 AM eth1 594858.00 1189686.00 38340.76 1758952.72 0.00 0.00 0.00 06:16:20 AM eth1 597017.00 1194019.00 38480.79 1765370.29 0.00 0.00 1.00 06:16:21 AM eth1 595450.00 1190936.00 38380.19 1760805.05 0.00 0.00 0.00 06:16:22 AM eth1 596385.00 1193096.00 38442.56 1763976.29 0.00 0.00 1.00 06:16:23 AM eth1 598155.00 1195978.00 38552.97 1768264.60 0.00 0.00 0.00 06:16:24 AM eth1 594405.00 1188643.00 38312.57 1757414.89 0.00 0.00 1.00 06:16:25 AM eth1 593366.00 1187154.00 38252.16 1755195.83 0.00 0.00 0.00 06:16:26 AM eth1 593188.00 1186118.00 38232.88 1753682.57 0.00 0.00 1.00 06:16:27 AM eth1 596301.00 1192241.00 38440.94 1762733.09 0.00 0.00 0.00 Average: eth1 595457.30 1190843.50 38381.69 1760664.84 0.00 0.00 0.50 lpaa23:~# ./tc -s -d qd sh dev eth1 | grep backlog backlog 7606336b 2513p requeues 167982 backlog 224072b 74p requeues 566 backlog 581376b 192p requeues 5598 backlog 181680b 60p requeues 1070 backlog 5305056b 1753p requeues 110166 // Here, this TX queue is attracting flows backlog 157456b 52p requeues 1758 backlog 672216b 222p requeues 3025 backlog 60560b 20p requeues 24541 backlog 448144b 148p requeues 21258 lpaa23:~# echo 1 >/proc/sys/net/ipv4/tcp_tsq_enable_tcp_wfree_ksoftirqd_detect Immediate jump to full bandwidth, and traffic is properly shard on all tx queues. lpaa23:~# sar -n DEV 1 10 | grep eth1 06:16:46 AM eth1 1397632.00 2795397.00 90081.87 4133031.26 0.00 0.00 1.00 06:16:47 AM eth1 1396874.00 2793614.00 90032.99 4130385.46 0.00 0.00 0.00 06:16:48 AM eth1 1395842.00 2791600.00 89966.46 4127409.67 0.00 0.00 1.00 06:16:49 AM eth1 1395528.00 2791017.00 89946.17 4126551.24 0.00 0.00 0.00 06:16:50 AM eth1 1397891.00 2795716.00 90098.74 4133497.39 0.00 0.00 1.00 06:16:51 AM eth1 1394951.00 2789984.00 89908.96 4125022.51 0.00 0.00 0.00 06:16:52 AM eth1 1394608.00 2789190.00 89886.90 4123851.36 0.00 0.00 1.00 06:16:53 AM eth1 1395314.00 2790653.00 89934.33 4125983.09 0.00 0.00 0.00 06:16:54 AM eth1 1396115.00 2792276.00 89984.25 4128411.21 0.00 0.00 1.00 06:16:55 AM eth1 1396829.00 2793523.00 90030.19 4130250.28 0.00 0.00 0.00 Average: eth1 1396158.40 2792297.00 89987.09 4128439.35 0.00 0.00 0.50 lpaa23:~# tc -s -d qd sh dev eth1 | grep backlog backlog 7900052b 2609p requeues 173287 backlog 878120b 290p requeues 589 backlog 1068884b 354p requeues 5621 backlog 996212b 329p requeues 1088 backlog 984100b 325p requeues 115316 backlog 956848b 316p requeues 1781 backlog 1080996b 357p requeues 3047 backlog 975016b 322p requeues 24571 backlog 990156b 327p requeues 21274 (All 8 TX queues get a fair share of the traffic) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0a5d97c20aa9..e13d77857225 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -839,26 +839,38 @@ void tcp_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; struct tcp_sock *tp = tcp_sk(sk); + int wmem; + + /* Keep one reference on sk_wmem_alloc. + * Will be released by sk_free() from here or tcp_tasklet_func() + */ + wmem = atomic_sub_return(skb->truesize - 1, &sk->sk_wmem_alloc); + + /* If this softirq is serviced by ksoftirqd, we are likely under stress. + * Wait until our queues (qdisc + devices) are drained. + * This gives : + * - less callbacks to tcp_write_xmit(), reducing stress (batches) + * - chance for incoming ACK (processed by another cpu maybe) + * to migrate this flow (skb->ooo_okay will be eventually set) + */ + if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current) + goto out; if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) && !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) { unsigned long flags; struct tsq_tasklet *tsq; - /* Keep a ref on socket. - * This last ref will be released in tcp_tasklet_func() - */ - atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc); - /* queue this socket to tasklet queue */ local_irq_save(flags); tsq = &__get_cpu_var(tsq_tasklet); list_add(&tp->tsq_node, &tsq->head); tasklet_schedule(&tsq->tasklet); local_irq_restore(flags); - } else { - sock_wfree(skb); + return; } +out: + sk_free(sk); } /* This routine actually transmits TCP packets queued in by -- cgit v1.2.3 From 04ffcb255f22a2a988ce7393e6e72f6eb3fcb7aa Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 14 Oct 2014 15:19:06 -0700 Subject: net: Add ndo_gso_check Add ndo_gso_check which a device can define to indicate whether is is capable of doing GSO on a packet. This funciton would be called from the stack to determine whether software GSO is needed to be done. A driver should populate this function if it advertises GSO types for which there are combinations that it wouldn't be able to handle. For instance a device that performs UDP tunneling might only implement support for transparent Ethernet bridging type of inner packets or might have limitations on lengths of inner headers. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- drivers/net/macvtap.c | 2 +- drivers/net/xen-netfront.c | 2 +- include/linux/netdevice.h | 12 +++++++++++- net/core/dev.c | 2 +- 4 files changed, 14 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 0c6adaaf898c..65e2892342bd 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -298,7 +298,7 @@ static rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb) */ if (q->flags & IFF_VNET_HDR) features |= vlan->tap_features; - if (netif_needs_gso(skb, features)) { + if (netif_needs_gso(dev, skb, features)) { struct sk_buff *segs = __skb_gso_segment(skb, features, false); if (IS_ERR(segs)) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index ca82f545ec2c..3c0b37574d05 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -638,7 +638,7 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(!netif_carrier_ok(dev) || (slots > 1 && !xennet_can_sg(dev)) || - netif_needs_gso(skb, netif_skb_features(skb)))) { + netif_needs_gso(dev, skb, netif_skb_features(skb)))) { spin_unlock_irqrestore(&queue->tx_lock, flags); goto drop; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 838407aea705..74fd5d37f15a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -998,6 +998,12 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * Callback to use for xmit over the accelerated station. This * is used in place of ndo_start_xmit on accelerated net * devices. + * bool (*ndo_gso_check) (struct sk_buff *skb, + * struct net_device *dev); + * Called by core transmit path to determine if device is capable of + * performing GSO on a packet. The device returns true if it is + * able to GSO the packet, false otherwise. If the return value is + * false the stack will do software GSO. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1147,6 +1153,8 @@ struct net_device_ops { struct net_device *dev, void *priv); int (*ndo_get_lock_subclass)(struct net_device *dev); + bool (*ndo_gso_check) (struct sk_buff *skb, + struct net_device *dev); }; /** @@ -3572,10 +3580,12 @@ static inline bool skb_gso_ok(struct sk_buff *skb, netdev_features_t features) (!skb_has_frag_list(skb) || (features & NETIF_F_FRAGLIST)); } -static inline bool netif_needs_gso(struct sk_buff *skb, +static inline bool netif_needs_gso(struct net_device *dev, struct sk_buff *skb, netdev_features_t features) { return skb_is_gso(skb) && (!skb_gso_ok(skb, features) || + (dev->netdev_ops->ndo_gso_check && + !dev->netdev_ops->ndo_gso_check(skb, dev)) || unlikely((skb->ip_summed != CHECKSUM_PARTIAL) && (skb->ip_summed != CHECKSUM_UNNECESSARY))); } diff --git a/net/core/dev.c b/net/core/dev.c index 4699dcfdc4ab..9f77a78c6b1c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2675,7 +2675,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device if (skb->encapsulation) features &= dev->hw_enc_features; - if (netif_needs_gso(skb, features)) { + if (netif_needs_gso(dev, skb, features)) { struct sk_buff *segs; segs = skb_gso_segment(skb, features); -- cgit v1.2.3 From 7e78cc46b7ec0c80257de8d09f0097081754e206 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 15 Oct 2014 21:03:18 +0200 Subject: openvswitch: kerneldoc warning fix s/sock/gs Signed-off-by: Fabian Frederick Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/vport-geneve.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index 910b3ef2c0d5..106a9d80b663 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -30,7 +30,7 @@ /** * struct geneve_port - Keeps track of open UDP ports - * @sock: The socket created for this port number. + * @gs: The socket created for this port number. * @name: vport name. */ struct geneve_port { -- cgit v1.2.3 From 4e8febd0a76333875636859e0092a14c1fba49e4 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 15 Oct 2014 21:03:41 +0200 Subject: openvswitch: use vport instead of p All functions used struct vport *vport except ovs_vport_find_upcall_portid. This fixes 1 kerneldoc warning Signed-off-by: Fabian Frederick Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/vport.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 53001b020ca7..6015802ebe6f 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -408,13 +408,13 @@ int ovs_vport_get_upcall_portids(const struct vport *vport, * * Returns the portid of the target socket. Must be called with rcu_read_lock. */ -u32 ovs_vport_find_upcall_portid(const struct vport *p, struct sk_buff *skb) +u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) { struct vport_portids *ids; u32 ids_index; u32 hash; - ids = rcu_dereference(p->upcall_portids); + ids = rcu_dereference(vport->upcall_portids); if (ids->n_ids == 1 && ids->ids[0] == 0) return 0; -- cgit v1.2.3 From 2077eebf7d8bf20b36524de45851e28111a60c52 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 15 Oct 2014 14:33:20 -0700 Subject: ipv4: call __ip_options_echo() in cookie_v4_check() commit 971f10eca186cab238c49da ("tcp: better TCP_SKB_CB layout to reduce cache line misses") missed that cookie_v4_check() still calls ip_options_echo() which uses IPCB(). It should use TCPCB() at TCP layer, so call __ip_options_echo() instead. Fixes: commit 971f10eca186cab238c49da ("tcp: better TCP_SKB_CB layout to reduce cache line misses") Cc: Krzysztof Kolasa Cc: Eric Dumazet Reported-by: Krzysztof Kolasa Tested-by: Krzysztof Kolasa Signed-off-by: Cong Wang Signed-off-by: Cong Wang Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/syncookies.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 0431a8f3c8f4..7e7401cdb9d7 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -321,7 +321,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, int opt_size = sizeof(struct ip_options_rcu) + opt->optlen; ireq->opt = kmalloc(opt_size, GFP_ATOMIC); - if (ireq->opt != NULL && ip_options_echo(&ireq->opt->opt, skb)) { + if (ireq->opt != NULL && __ip_options_echo(&ireq->opt->opt, skb, opt)) { kfree(ireq->opt); ireq->opt = NULL; } -- cgit v1.2.3 From e25f866fbc8a4bf387b5dbe8e25aa5b07e55c74f Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 15 Oct 2014 14:33:21 -0700 Subject: ipv4: share tcp_v4_save_options() with cookie_v4_check() cookie_v4_check() allocates ip_options_rcu in the same way with tcp_v4_save_options(), we can just make it a helper function. Cc: Krzysztof Kolasa Cc: Eric Dumazet Signed-off-by: Cong Wang Signed-off-by: Cong Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 20 ++++++++++++++++++++ net/ipv4/syncookies.c | 10 +--------- net/ipv4/tcp_ipv4.c | 20 -------------------- 3 files changed, 21 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 74efeda994b3..869637a7caf7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1666,4 +1666,24 @@ int tcpv4_offload_init(void); void tcp_v4_init(void); void tcp_init(void); +/* + * Save and compile IPv4 options, return a pointer to it + */ +static inline struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) +{ + const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; + struct ip_options_rcu *dopt = NULL; + + if (opt && opt->optlen) { + int opt_size = sizeof(*dopt) + opt->optlen; + + dopt = kmalloc(opt_size, GFP_ATOMIC); + if (dopt && __ip_options_echo(&dopt->opt, skb, opt)) { + kfree(dopt); + dopt = NULL; + } + } + return dopt; +} + #endif /* _TCP_H */ diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 7e7401cdb9d7..c68d0a1a468e 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -317,15 +317,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, /* We throwed the options of the initial SYN away, so we hope * the ACK carries the same options again (see RFC1122 4.2.3.8) */ - if (opt && opt->optlen) { - int opt_size = sizeof(struct ip_options_rcu) + opt->optlen; - - ireq->opt = kmalloc(opt_size, GFP_ATOMIC); - if (ireq->opt != NULL && __ip_options_echo(&ireq->opt->opt, skb, opt)) { - kfree(ireq->opt); - ireq->opt = NULL; - } - } + ireq->opt = tcp_v4_save_options(skb); if (security_inet_conn_request(sk, skb, req)) { reqsk_free(req); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 552e87e3c269..6a2a7d659a7a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -880,26 +880,6 @@ bool tcp_syn_flood_action(struct sock *sk, } EXPORT_SYMBOL(tcp_syn_flood_action); -/* - * Save and compile IPv4 options into the request_sock if needed. - */ -static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) -{ - const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; - struct ip_options_rcu *dopt = NULL; - - if (opt && opt->optlen) { - int opt_size = sizeof(*dopt) + opt->optlen; - - dopt = kmalloc(opt_size, GFP_ATOMIC); - if (dopt && __ip_options_echo(&dopt->opt, skb, opt)) { - kfree(dopt); - dopt = NULL; - } - } - return dopt; -} - #ifdef CONFIG_TCP_MD5SIG /* * RFC2385 MD5 checksumming requires a mapping of -- cgit v1.2.3 From 461b74c391c4ec9c766794e158508c357d8952e6 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 15 Oct 2014 14:33:22 -0700 Subject: ipv4: clean up cookie_v4_check() We can retrieve opt from skb, no need to pass it as a parameter. And opt should always be non-NULL, no need to check. Cc: Krzysztof Kolasa Cc: Eric Dumazet Tested-by: Krzysztof Kolasa Signed-off-by: Cong Wang Signed-off-by: Cong Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 5 ++--- net/ipv4/syncookies.c | 6 +++--- net/ipv4/tcp_ipv4.c | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 869637a7caf7..3a4bbbfdcf8d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -468,8 +468,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb); /* From syncookies.c */ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, u32 cookie); -struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, - struct ip_options *opt); +struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); #ifdef CONFIG_SYN_COOKIES /* Syncookies use a monotonic timer which increments every 60 seconds. @@ -1674,7 +1673,7 @@ static inline struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; struct ip_options_rcu *dopt = NULL; - if (opt && opt->optlen) { + if (opt->optlen) { int opt_size = sizeof(*dopt) + opt->optlen; dopt = kmalloc(opt_size, GFP_ATOMIC); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index c68d0a1a468e..d346303fac77 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -255,9 +255,9 @@ bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, } EXPORT_SYMBOL(cookie_check_timestamp); -struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, - struct ip_options *opt) +struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) { + struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; struct tcp_options_received tcp_opt; struct inet_request_sock *ireq; struct tcp_request_sock *treq; @@ -336,7 +336,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP, inet_sk_flowi_flags(sk), - (opt && opt->srr) ? opt->faddr : ireq->ir_rmt_addr, + opt->srr ? opt->faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, th->source, th->dest); security_req_classify_flow(req, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(sock_net(sk), &fl4); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6a2a7d659a7a..94d1a7757ff7 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1408,7 +1408,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) #ifdef CONFIG_SYN_COOKIES if (!th->syn) - sk = cookie_v4_check(sk, skb, &TCP_SKB_CB(skb)->header.h4.opt); + sk = cookie_v4_check(sk, skb); #endif return sk; } -- cgit v1.2.3 From 4062090e3e5caaf55bed4523a69f26c3265cc1d2 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Wed, 15 Oct 2014 16:24:02 +0400 Subject: ipv4: dst_entry leak in ip_send_unicast_reply() ip_setup_cork() called inside ip_append_data() steals dst entry from rt to cork and in case errors in __ip_append_data() nobody frees stolen dst entry Fixes: 2e77d89b2fa8 ("net: avoid a pair of dst_hold()/dst_release() in ip_append_data()") Signed-off-by: Vasily Averin Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index e35b71289156..88e5ef2c7f51 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1535,6 +1535,7 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, struct sk_buff *nskb; struct sock *sk; struct inet_sock *inet; + int err; if (__ip_options_echo(&replyopts.opt.opt, skb, sopt)) return; @@ -1574,8 +1575,13 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, sock_net_set(sk, net); __skb_queue_head_init(&sk->sk_write_queue); sk->sk_sndbuf = sysctl_wmem_default; - ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, - &ipc, &rt, MSG_DONTWAIT); + err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, + len, 0, &ipc, &rt, MSG_DONTWAIT); + if (unlikely(err)) { + ip_flush_pending_frames(sk); + goto out; + } + nskb = skb_peek(&sk->sk_write_queue); if (nskb) { if (arg->csumoffset >= 0) @@ -1587,7 +1593,7 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); ip_push_pending_frames(sk, &fl4); } - +out: put_cpu_var(unicast_sock); ip_rt_put(rt); -- cgit v1.2.3 From 389f48947a5a37ea283de520abb742d42174edb0 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Fri, 17 Oct 2014 14:03:08 +0800 Subject: openvswitch: fix a use after free pskb_may_pull() called by arphdr_ok can change skb->data, so put the arp setting after arphdr_ok to avoid the use the freed memory Fixes: 0714812134d7d ("openvswitch: Eliminate memset() from flow_extract.") Cc: Jesse Gross Cc: Eric Dumazet Signed-off-by: Li RongQing Acked-by: Jesse Gross Signed-off-by: David S. Miller --- net/openvswitch/flow.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 62db02ba36bc..c5cfc72a5535 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -557,10 +557,11 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) } else if (key->eth.type == htons(ETH_P_ARP) || key->eth.type == htons(ETH_P_RARP)) { struct arp_eth_header *arp; + bool arp_available = arphdr_ok(skb); arp = (struct arp_eth_header *)skb_network_header(skb); - if (arphdr_ok(skb) && + if (arp_available && arp->ar_hrd == htons(ARPHRD_ETHER) && arp->ar_pro == htons(ETH_P_IP) && arp->ar_hln == ETH_ALEN && -- cgit v1.2.3 From f47de068f68db91b89e0d3335230d07e02da8727 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Thu, 16 Oct 2014 21:55:45 -0700 Subject: openvswitch: Create right mask with disabled megaflows If megaflows are disabled, the userspace does not send the netlink attribute OVS_FLOW_ATTR_MASK, and the kernel must create an exact match mask. sw_flow_mask_set() sets every bytes (in 'range') of the mask to 0xff, even the bytes that represent padding for struct sw_flow, or the bytes that represent fields that may not be set during ovs_flow_extract(). This is a problem, because when we extract a flow from a packet, we do not memset() anymore the struct sw_flow to 0. This commit gets rid of sw_flow_mask_set() and introduces mask_set_nlattr(), which operates on the netlink attributes rather than on the mask key. Using this approach we are sure that only the bytes that the user provided in the flow are matched. Also, if the parse_flow_mask_nlattrs() for the mask ENCAP attribute fails, we now return with an error. This bug is introduced by commit 0714812134d7dcadeb7ecfbfeb18788aa7e1eaac ("openvswitch: Eliminate memset() from flow_extract"). Reported-by: Alex Wang Signed-off-by: Daniele Di Proietto Signed-off-by: Andy Zhou Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/flow_netlink.c | 93 ++++++++++++++++++++++++++++++++---------- 1 file changed, 72 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 368f23307911..939bcb32100f 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -103,10 +103,19 @@ static void update_range__(struct sw_flow_match *match, SW_FLOW_KEY_MEMCPY_OFFSET(match, offsetof(struct sw_flow_key, field), \ value_p, len, is_mask) -static u16 range_n_bytes(const struct sw_flow_key_range *range) -{ - return range->end - range->start; -} +#define SW_FLOW_KEY_MEMSET_FIELD(match, field, value, is_mask) \ + do { \ + update_range__(match, offsetof(struct sw_flow_key, field), \ + sizeof((match)->key->field), is_mask); \ + if (is_mask) { \ + if ((match)->mask) \ + memset((u8 *)&(match)->mask->key.field, value,\ + sizeof((match)->mask->key.field)); \ + } else { \ + memset((u8 *)&(match)->key->field, value, \ + sizeof((match)->key->field)); \ + } \ + } while (0) static bool match_validate(const struct sw_flow_match *match, u64 key_attrs, u64 mask_attrs) @@ -809,13 +818,26 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, return 0; } -static void sw_flow_mask_set(struct sw_flow_mask *mask, - struct sw_flow_key_range *range, u8 val) +static void nlattr_set(struct nlattr *attr, u8 val, bool is_attr_mask_key) { - u8 *m = (u8 *)&mask->key + range->start; + struct nlattr *nla; + int rem; + + /* The nlattr stream should already have been validated */ + nla_for_each_nested(nla, attr, rem) { + /* We assume that ovs_key_lens[type] == -1 means that type is a + * nested attribute + */ + if (is_attr_mask_key && ovs_key_lens[nla_type(nla)] == -1) + nlattr_set(nla, val, false); + else + memset(nla_data(nla), val, nla_len(nla)); + } +} - mask->range = *range; - memset(m, val, range_n_bytes(range)); +static void mask_set_nlattr(struct nlattr *attr, u8 val) +{ + nlattr_set(attr, val, true); } /** @@ -836,6 +858,7 @@ int ovs_nla_get_match(struct sw_flow_match *match, { const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; const struct nlattr *encap; + struct nlattr *newmask = NULL; u64 key_attrs = 0; u64 mask_attrs = 0; bool encap_valid = false; @@ -882,18 +905,44 @@ int ovs_nla_get_match(struct sw_flow_match *match, if (err) return err; + if (match->mask && !mask) { + /* Create an exact match mask. We need to set to 0xff all the + * 'match->mask' fields that have been touched in 'match->key'. + * We cannot simply memset 'match->mask', because padding bytes + * and fields not specified in 'match->key' should be left to 0. + * Instead, we use a stream of netlink attributes, copied from + * 'key' and set to 0xff: ovs_key_from_nlattrs() will take care + * of filling 'match->mask' appropriately. + */ + newmask = kmemdup(key, nla_total_size(nla_len(key)), + GFP_KERNEL); + if (!newmask) + return -ENOMEM; + + mask_set_nlattr(newmask, 0xff); + + /* The userspace does not send tunnel attributes that are 0, + * but we should not wildcard them nonetheless. + */ + if (match->key->tun_key.ipv4_dst) + SW_FLOW_KEY_MEMSET_FIELD(match, tun_key, 0xff, true); + + mask = newmask; + } + if (mask) { err = parse_flow_mask_nlattrs(mask, a, &mask_attrs); if (err) - return err; + goto free_newmask; - if (mask_attrs & 1 << OVS_KEY_ATTR_ENCAP) { + if (mask_attrs & 1 << OVS_KEY_ATTR_ENCAP) { __be16 eth_type = 0; __be16 tci = 0; if (!encap_valid) { OVS_NLERR("Encap mask attribute is set for non-VLAN frame.\n"); - return -EINVAL; + err = -EINVAL; + goto free_newmask; } mask_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); @@ -904,10 +953,13 @@ int ovs_nla_get_match(struct sw_flow_match *match, mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); encap = a[OVS_KEY_ATTR_ENCAP]; err = parse_flow_mask_nlattrs(encap, a, &mask_attrs); + if (err) + goto free_newmask; } else { OVS_NLERR("VLAN frames must have an exact match on the TPID (mask=%x).\n", ntohs(eth_type)); - return -EINVAL; + err = -EINVAL; + goto free_newmask; } if (a[OVS_KEY_ATTR_VLAN]) @@ -915,23 +967,22 @@ int ovs_nla_get_match(struct sw_flow_match *match, if (!(tci & htons(VLAN_TAG_PRESENT))) { OVS_NLERR("VLAN tag present bit must have an exact match (tci_mask=%x).\n", ntohs(tci)); - return -EINVAL; + err = -EINVAL; + goto free_newmask; } } err = ovs_key_from_nlattrs(match, mask_attrs, a, true); if (err) - return err; - } else { - /* Populate exact match flow's key mask. */ - if (match->mask) - sw_flow_mask_set(match->mask, &match->range, 0xff); + goto free_newmask; } if (!match_validate(match, key_attrs, mask_attrs)) - return -EINVAL; + err = -EINVAL; - return 0; +free_newmask: + kfree(newmask); + return err; } /** -- cgit v1.2.3 From 1245dfc8cadb258386fcd27df38215a0eccb1f17 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Fri, 17 Oct 2014 16:53:23 +0800 Subject: ipv4: fix a potential use after free in ip_tunnel_core.c pskb_may_pull() maybe change skb->data and make eth pointer oboslete, so set eth after pskb_may_pull() Fixes:3d7b46cd("ip_tunnel: push generic protocol handling to ip_tunnel module") Cc: Pravin B Shelar Signed-off-by: Li RongQing Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index f4c987bb7e94..88c386cf7d85 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -91,11 +91,12 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) skb_pull_rcsum(skb, hdr_len); if (inner_proto == htons(ETH_P_TEB)) { - struct ethhdr *eh = (struct ethhdr *)skb->data; + struct ethhdr *eh; if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) return -ENOMEM; + eh = (struct ethhdr *)skb->data; if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN)) skb->protocol = eh->h_proto; else -- cgit v1.2.3 From d8f00d27105a1553a13d4a96c3eb4544f70ca908 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Fri, 17 Oct 2014 16:53:47 +0800 Subject: ipv4: fix a potential use after free in fou.c pskb_may_pull() maybe change skb->data and make uh pointer oboslete, so reload uh and guehdr Fixes: 37dd0247 ("gue: Receive side for Generic UDP Encapsulation") Cc: Tom Herbert Signed-off-by: Li RongQing Signed-off-by: David S. Miller --- net/ipv4/fou.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index efa70ad44906..32e78924e246 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -87,6 +87,9 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) if (!pskb_may_pull(skb, len)) goto drop; + uh = udp_hdr(skb); + guehdr = (struct guehdr *)&uh[1]; + if (guehdr->version != 0) goto drop; -- cgit v1.2.3 From 870c3151382c980590d4d609babf3b0243e7db93 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 17 Oct 2014 09:17:20 -0700 Subject: ipv6: introduce tcp_v6_iif() Commit 971f10eca186 ("tcp: better TCP_SKB_CB layout to reduce cache line misses") added a regression for SO_BINDTODEVICE on IPv6. This is because we still use inet6_iif() which expects that IP6 control block is still at the beginning of skb->cb[] This patch adds tcp_v6_iif() helper and uses it where necessary. Because __inet6_lookup_skb() is used by TCP and DCCP, we add an iif parameter to it. Signed-off-by: Eric Dumazet Fixes: 971f10eca186 ("tcp: better TCP_SKB_CB layout to reduce cache line misses") Acked-by: Cong Wang Signed-off-by: David S. Miller --- include/net/inet6_hashtables.h | 5 +++-- include/net/tcp.h | 9 +++++++++ net/dccp/ipv6.c | 3 ++- net/ipv6/syncookies.c | 2 +- net/ipv6/tcp_ipv6.c | 26 +++++++++++++++----------- 5 files changed, 30 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index ae0613544308..d1d272843b3b 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -80,7 +80,8 @@ static inline struct sock *__inet6_lookup(struct net *net, static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, struct sk_buff *skb, const __be16 sport, - const __be16 dport) + const __be16 dport, + int iif) { struct sock *sk = skb_steal_sock(skb); @@ -90,7 +91,7 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, return __inet6_lookup(dev_net(skb_dst(skb)->dev), hashinfo, &ipv6_hdr(skb)->saddr, sport, &ipv6_hdr(skb)->daddr, ntohs(dport), - inet6_iif(skb)); + iif); } struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, diff --git a/include/net/tcp.h b/include/net/tcp.h index 3a4bbbfdcf8d..c9766f89deba 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -729,6 +729,15 @@ struct tcp_skb_cb { #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) + +/* This is the variant of inet6_iif() that must be used by TCP, + * as TCP moves IP6CB into a different location in skb->cb[] + */ +static inline int tcp_v6_iif(const struct sk_buff *skb) +{ + return TCP_SKB_CB(skb)->header.h6.iif; +} + /* Due to TSO, an SKB can be composed of multiple actual * packets. To keep these tracked properly, we use this. */ diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index ad2acfe1ca61..6bcaa33cd804 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -757,7 +757,8 @@ static int dccp_v6_rcv(struct sk_buff *skb) /* Step 2: * Look up flow ID in table and get corresponding socket */ sk = __inet6_lookup_skb(&dccp_hashinfo, skb, - dh->dccph_sport, dh->dccph_dport); + dh->dccph_sport, dh->dccph_dport, + inet6_iif(skb)); /* * Step 2: * If no socket ... diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 9a2838e93cc5..2a86a0f00f2b 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -214,7 +214,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) /* So that link locals have meaning */ if (!sk->sk_bound_dev_if && ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) - ireq->ir_iif = inet6_iif(skb); + ireq->ir_iif = tcp_v6_iif(skb); ireq->ir_mark = inet_request_mark(sk, skb); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index cf2e45ab2fa4..831495529b82 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -424,6 +424,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (sock_owned_by_user(sk)) goto out; + /* Note : We use inet6_iif() here, not tcp_v6_iif() */ req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr, &hdr->saddr, inet6_iif(skb)); if (!req) @@ -738,7 +739,7 @@ static void tcp_v6_init_req(struct request_sock *req, struct sock *sk, /* So that link locals have meaning */ if (!sk->sk_bound_dev_if && ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) - ireq->ir_iif = inet6_iif(skb); + ireq->ir_iif = tcp_v6_iif(skb); if (!TCP_SKB_CB(skb)->tcp_tw_isn && (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) || @@ -860,7 +861,7 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, fl6.flowi6_proto = IPPROTO_TCP; if (rt6_need_strict(&fl6.daddr) && !oif) - fl6.flowi6_oif = inet6_iif(skb); + fl6.flowi6_oif = tcp_v6_iif(skb); else fl6.flowi6_oif = oif; fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark); @@ -918,7 +919,7 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev), &tcp_hashinfo, &ipv6h->saddr, th->source, &ipv6h->daddr, - ntohs(th->source), inet6_iif(skb)); + ntohs(th->source), tcp_v6_iif(skb)); if (!sk1) return; @@ -1000,13 +1001,14 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) /* Find possible connection requests. */ req = inet6_csk_search_req(sk, &prev, th->source, &ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr, inet6_iif(skb)); + &ipv6_hdr(skb)->daddr, tcp_v6_iif(skb)); if (req) return tcp_check_req(sk, skb, req, prev, false); nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo, - &ipv6_hdr(skb)->saddr, th->source, - &ipv6_hdr(skb)->daddr, ntohs(th->dest), inet6_iif(skb)); + &ipv6_hdr(skb)->saddr, th->source, + &ipv6_hdr(skb)->daddr, ntohs(th->dest), + tcp_v6_iif(skb)); if (nsk) { if (nsk->sk_state != TCP_TIME_WAIT) { @@ -1090,7 +1092,7 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newnp->ipv6_fl_list = NULL; newnp->pktoptions = NULL; newnp->opt = NULL; - newnp->mcast_oif = inet6_iif(skb); + newnp->mcast_oif = tcp_v6_iif(skb); newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb)); if (np->repflow) @@ -1174,7 +1176,7 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, skb_set_owner_r(newnp->pktoptions, newsk); } newnp->opt = NULL; - newnp->mcast_oif = inet6_iif(skb); + newnp->mcast_oif = tcp_v6_iif(skb); newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb)); if (np->repflow) @@ -1360,7 +1362,7 @@ ipv6_pktoptions: if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt && !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo) - np->mcast_oif = inet6_iif(opt_skb); + np->mcast_oif = tcp_v6_iif(opt_skb); if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit; if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass) @@ -1427,7 +1429,8 @@ static int tcp_v6_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; - sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); + sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest, + tcp_v6_iif(skb)); if (!sk) goto no_tcp_socket; @@ -1514,7 +1517,7 @@ do_time_wait: sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo, &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, - ntohs(th->dest), inet6_iif(skb)); + ntohs(th->dest), tcp_v6_iif(skb)); if (sk2 != NULL) { struct inet_timewait_sock *tw = inet_twsk(sk); inet_twsk_deschedule(tw, &tcp_death_row); @@ -1553,6 +1556,7 @@ static void tcp_v6_early_demux(struct sk_buff *skb) if (th->doff < sizeof(struct tcphdr) / 4) return; + /* Note : We use inet6_iif() here, not tcp_v6_iif() */ sk = __inet6_lookup_established(dev_net(skb->dev), &tcp_hashinfo, &hdr->saddr, th->source, &hdr->daddr, ntohs(th->dest), -- cgit v1.2.3 From 643566d4b47e2956110e79c0e6f65db9b9ea42c6 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Fri, 17 Oct 2014 15:25:28 -0400 Subject: tipc: fix bug in bundled buffer reception In commit ec8a2e5621db2da24badb3969eda7fd359e1869f ("tipc: same receive code path for connection protocol and data messages") we omitted the the possiblilty that an arriving message extracted from a bundle buffer may be a multicast message. Such messages need to be to be delivered to the socket via a separate function, tipc_sk_mcast_rcv(). As a result, small multicast messages arriving as members of a bundle buffer will be silently dropped. This commit corrects the error by considering this case in the function tipc_link_bundle_rcv(). Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 65410e18b8a6..1db162aa64a5 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1924,7 +1924,12 @@ void tipc_link_bundle_rcv(struct sk_buff *buf) } omsg = buf_msg(obuf); pos += align(msg_size(omsg)); - if (msg_isdata(omsg) || (msg_user(omsg) == CONN_MANAGER)) { + if (msg_isdata(omsg)) { + if (unlikely(msg_type(omsg) == TIPC_MCAST_MSG)) + tipc_sk_mcast_rcv(obuf); + else + tipc_sk_rcv(obuf); + } else if (msg_user(omsg) == CONN_MANAGER) { tipc_sk_rcv(obuf); } else if (msg_user(omsg) == NAME_DISTRIBUTOR) { tipc_named_rcv(obuf); -- cgit v1.2.3 From dc8e54165f1dc8ee946c953512a877676f8bbe3f Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 17 Oct 2014 22:00:22 +0200 Subject: netrom: use linux/uaccess.h replace asm/uaccess.h by linux/uaccess.h Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/netrom/af_netrom.c | 2 +- net/netrom/nr_dev.c | 2 +- net/netrom/nr_in.c | 2 +- net/netrom/nr_out.c | 2 +- net/netrom/nr_route.c | 2 +- net/netrom/nr_subr.c | 2 +- net/netrom/nr_timer.c | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 71cf1bffea06..1b06a1fcf3e8 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include /* For TIOCINQ/OUTQ */ #include diff --git a/net/netrom/nr_dev.c b/net/netrom/nr_dev.c index 743262becd6e..6ae063cebf7d 100644 --- a/net/netrom/nr_dev.c +++ b/net/netrom/nr_dev.c @@ -20,8 +20,8 @@ #include #include /* For the statistics structure. */ #include +#include -#include #include #include diff --git a/net/netrom/nr_in.c b/net/netrom/nr_in.c index c3073a2ef634..80dbd0beb516 100644 --- a/net/netrom/nr_in.c +++ b/net/netrom/nr_in.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/netrom/nr_out.c b/net/netrom/nr_out.c index 0b4bcb2bf38f..00fbf1419ec6 100644 --- a/net/netrom/nr_out.c +++ b/net/netrom/nr_out.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index b976d5eff2de..96b64d2f6dbf 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include /* For TIOCINQ/OUTQ */ #include diff --git a/net/netrom/nr_subr.c b/net/netrom/nr_subr.c index ca40e2298f5a..029c8bb90f4c 100644 --- a/net/netrom/nr_subr.c +++ b/net/netrom/nr_subr.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c index ff2c1b142f57..94d05806a9a2 100644 --- a/net/netrom/nr_timer.c +++ b/net/netrom/nr_timer.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3 From 25ef1328a03c72a7285883d5b337c4b602476ecd Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Fri, 17 Oct 2014 13:56:31 -0700 Subject: openvswitch: Set flow-key members. This patch adds missing memset which are required to initialize flow key member. For example for IP flow we need to initialize ip.frag for all cases. Found by inspection. This bug is introduced by commit 0714812134d7dcadeb7ecfbfeb18788aa7e1eaac ("openvswitch: Eliminate memset() from flow_extract"). Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/flow.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index c5cfc72a5535..2b78789ea7c5 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -274,6 +274,8 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) key->ip.frag = OVS_FRAG_TYPE_LATER; else key->ip.frag = OVS_FRAG_TYPE_FIRST; + } else { + key->ip.frag = OVS_FRAG_TYPE_NONE; } nh_len = payload_ofs - nh_ofs; @@ -358,6 +360,7 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, */ key->tp.src = htons(icmp->icmp6_type); key->tp.dst = htons(icmp->icmp6_code); + memset(&key->ipv6.nd, 0, sizeof(key->ipv6.nd)); if (icmp->icmp6_code == 0 && (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION || @@ -674,9 +677,6 @@ int ovs_flow_key_extract(struct ovs_tunnel_info *tun_info, key->ovs_flow_hash = 0; key->recirc_id = 0; - /* Flags are always used as part of stats */ - key->tp.flags = 0; - return key_extract(skb, key); } -- cgit v1.2.3 From a28205437b41a2c1333c1599ce1e8f09af7b00d6 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 17 Oct 2014 16:02:13 -0700 Subject: net: dsa: add includes for ethtool and phy_fixed definitions net/dsa/slave.c uses functions and structures declared in phy_fixed.h but does not explicitely include it, while dsa.h needs structure declarations for 'struct ethtool_wolinfo' and 'struct ethtool_eee', fix those by including the correct header files. Fixes: ec9436baedb6 ("net: dsa: allow drivers to do link adjustment") Fixes: ce31b31c68e7 ("net: dsa: allow updating fixed PHY link information") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + net/dsa/slave.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 58ad8c6492db..b76559293535 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -18,6 +18,7 @@ #include #include #include +#include enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = 0, diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 8030489d9cbe..a851e9f14118 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include "dsa_priv.h" -- cgit v1.2.3