diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-05 14:54:29 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-05 14:54:29 -0700 |
commit | cc998ff8811530be521f6b316f37ab7676a07938 (patch) | |
tree | a054b3bf4b2ef406bf756a6cfc9be2f9115f17ae /net/core/sock.c | |
parent | 57d730924d5cc2c3e280af16a9306587c3a511db (diff) | |
parent | 0d40f75bdab241868c0eb6f97aef9f8b3a66f7b3 (diff) | |
download | linux-cc998ff8811530be521f6b316f37ab7676a07938.tar.bz2 |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking changes from David Miller:
"Noteworthy changes this time around:
1) Multicast rejoin support for team driver, from Jiri Pirko.
2) Centralize and simplify TCP RTT measurement handling in order to
reduce the impact of bad RTO seeding from SYN/ACKs. Also, when
both timestamps and local RTT measurements are available prefer
the later because there are broken middleware devices which
scramble the timestamp.
From Yuchung Cheng.
3) Add TCP_NOTSENT_LOWAT socket option to limit the amount of kernel
memory consumed to queue up unsend user data. From Eric Dumazet.
4) Add a "physical port ID" abstraction for network devices, from
Jiri Pirko.
5) Add a "suppress" operation to influence fib_rules lookups, from
Stefan Tomanek.
6) Add a networking development FAQ, from Paul Gortmaker.
7) Extend the information provided by tcp_probe and add ipv6 support,
from Daniel Borkmann.
8) Use RCU locking more extensively in openvswitch data paths, from
Pravin B Shelar.
9) Add SCTP support to openvswitch, from Joe Stringer.
10) Add EF10 chip support to SFC driver, from Ben Hutchings.
11) Add new SYNPROXY netfilter target, from Patrick McHardy.
12) Compute a rate approximation for sending in TCP sockets, and use
this to more intelligently coalesce TSO frames. Furthermore, add
a new packet scheduler which takes advantage of this estimate when
available. From Eric Dumazet.
13) Allow AF_PACKET fanouts with random selection, from Daniel
Borkmann.
14) Add ipv6 support to vxlan driver, from Cong Wang"
Resolved conflicts as per discussion.
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1218 commits)
openvswitch: Fix alignment of struct sw_flow_key.
netfilter: Fix build errors with xt_socket.c
tcp: Add missing braces to do_tcp_setsockopt
caif: Add missing braces to multiline if in cfctrl_linkup_request
bnx2x: Add missing braces in bnx2x:bnx2x_link_initialize
vxlan: Fix kernel panic on device delete.
net: mvneta: implement ->ndo_do_ioctl() to support PHY ioctls
net: mvneta: properly disable HW PHY polling and ensure adjust_link() works
icplus: Use netif_running to determine device state
ethernet/arc/arc_emac: Fix huge delays in large file copies
tuntap: orphan frags before trying to set tx timestamp
tuntap: purge socket error queue on detach
qlcnic: use standard NAPI weights
ipv6:introduce function to find route for redirect
bnx2x: VF RSS support - VF side
bnx2x: VF RSS support - PF side
vxlan: Notify drivers for listening UDP port changes
net: usbnet: update addr_assign_type if appropriate
driver/net: enic: update enic maintainers and driver
driver/net: enic: Exposing symbols for Cisco's low latency driver
...
Diffstat (limited to 'net/core/sock.c')
-rw-r--r-- | net/core/sock.c | 166 |
1 files changed, 117 insertions, 49 deletions
diff --git a/net/core/sock.c b/net/core/sock.c index 2c097c5a35dd..5b6beba494a3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -93,6 +93,7 @@ #include <linux/capability.h> #include <linux/errno.h> +#include <linux/errqueue.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> @@ -1575,6 +1576,25 @@ void sock_wfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_wfree); +void skb_orphan_partial(struct sk_buff *skb) +{ + /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc, + * so we do not completely orphan skb, but transfert all + * accounted bytes but one, to avoid unexpected reorders. + */ + if (skb->destructor == sock_wfree +#ifdef CONFIG_INET + || skb->destructor == tcp_wfree +#endif + ) { + atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc); + skb->truesize = 1; + } else { + skb_orphan(skb); + } +} +EXPORT_SYMBOL(skb_orphan_partial); + /* * Read buffer destructor automatically called from kfree_skb. */ @@ -1721,24 +1741,23 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, - int *errcode) + int *errcode, int max_page_order) { - struct sk_buff *skb; + struct sk_buff *skb = NULL; + unsigned long chunk; gfp_t gfp_mask; long timeo; int err; int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; + struct page *page; + int i; err = -EMSGSIZE; if (npages > MAX_SKB_FRAGS) goto failure; - gfp_mask = sk->sk_allocation; - if (gfp_mask & __GFP_WAIT) - gfp_mask |= __GFP_REPEAT; - timeo = sock_sndtimeo(sk, noblock); - while (1) { + while (!skb) { err = sock_error(sk); if (err != 0) goto failure; @@ -1747,50 +1766,52 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, if (sk->sk_shutdown & SEND_SHUTDOWN) goto failure; - if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { - skb = alloc_skb(header_len, gfp_mask); - if (skb) { - int i; - - /* No pages, we're done... */ - if (!data_len) - break; - - skb->truesize += data_len; - skb_shinfo(skb)->nr_frags = npages; - for (i = 0; i < npages; i++) { - struct page *page; - - page = alloc_pages(sk->sk_allocation, 0); - if (!page) { - err = -ENOBUFS; - skb_shinfo(skb)->nr_frags = i; - kfree_skb(skb); - goto failure; - } - - __skb_fill_page_desc(skb, i, - page, 0, - (data_len >= PAGE_SIZE ? - PAGE_SIZE : - data_len)); - data_len -= PAGE_SIZE; - } + if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) { + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = -EAGAIN; + if (!timeo) + goto failure; + if (signal_pending(current)) + goto interrupted; + timeo = sock_wait_for_wmem(sk, timeo); + continue; + } - /* Full success... */ - break; - } - err = -ENOBUFS; + err = -ENOBUFS; + gfp_mask = sk->sk_allocation; + if (gfp_mask & __GFP_WAIT) + gfp_mask |= __GFP_REPEAT; + + skb = alloc_skb(header_len, gfp_mask); + if (!skb) goto failure; + + skb->truesize += data_len; + + for (i = 0; npages > 0; i++) { + int order = max_page_order; + + while (order) { + if (npages >= 1 << order) { + page = alloc_pages(sk->sk_allocation | + __GFP_COMP | __GFP_NOWARN, + order); + if (page) + goto fill_page; + } + order--; + } + page = alloc_page(sk->sk_allocation); + if (!page) + goto failure; +fill_page: + chunk = min_t(unsigned long, data_len, + PAGE_SIZE << order); + skb_fill_page_desc(skb, i, page, 0, chunk); + data_len -= chunk; + npages -= 1 << order; } - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - err = -EAGAIN; - if (!timeo) - goto failure; - if (signal_pending(current)) - goto interrupted; - timeo = sock_wait_for_wmem(sk, timeo); } skb_set_owner_w(skb, sk); @@ -1799,6 +1820,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, interrupted: err = sock_intr_errno(timeo); failure: + kfree_skb(skb); *errcode = err; return NULL; } @@ -1807,7 +1829,7 @@ EXPORT_SYMBOL(sock_alloc_send_pskb); struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode) { - return sock_alloc_send_pskb(sk, size, 0, noblock, errcode); + return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); } EXPORT_SYMBOL(sock_alloc_send_skb); @@ -2425,6 +2447,52 @@ void sock_enable_timestamp(struct sock *sk, int flag) } } +int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, + int level, int type) +{ + struct sock_exterr_skb *serr; + struct sk_buff *skb, *skb2; + int copied, err; + + err = -EAGAIN; + skb = skb_dequeue(&sk->sk_error_queue); + if (skb == NULL) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto out_free_skb; + + sock_recv_timestamp(msg, sk, skb); + + serr = SKB_EXT_ERR(skb); + put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); + + msg->msg_flags |= MSG_ERRQUEUE; + err = copied; + + /* Reset and regenerate socket error */ + spin_lock_bh(&sk->sk_error_queue.lock); + sk->sk_err = 0; + if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { + sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; + spin_unlock_bh(&sk->sk_error_queue.lock); + sk->sk_error_report(sk); + } else + spin_unlock_bh(&sk->sk_error_queue.lock); + +out_free_skb: + kfree_skb(skb); +out: + return err; +} +EXPORT_SYMBOL(sock_recv_errqueue); + /* * Get a socket option on an socket. * |