summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/networking/segmentation-offloads.txt130
-rw-r--r--include/linux/netdev_features.h8
-rw-r--r--include/linux/netdevice.h8
-rw-r--r--include/linux/skbuff.h27
-rw-r--r--net/core/dev.c67
-rw-r--r--net/core/ethtool.c4
-rw-r--r--net/core/skbuff.c29
-rw-r--r--net/ipv4/af_inet.c70
-rw-r--r--net/ipv4/gre_offload.c27
-rw-r--r--net/ipv4/tcp_offload.c30
-rw-r--r--net/ipv4/udp_offload.c27
-rw-r--r--net/ipv6/ip6_offload.c21
-rw-r--r--net/mpls/mpls_gso.c1
13 files changed, 395 insertions, 54 deletions
diff --git a/Documentation/networking/segmentation-offloads.txt b/Documentation/networking/segmentation-offloads.txt
new file mode 100644
index 000000000000..f200467ade38
--- /dev/null
+++ b/Documentation/networking/segmentation-offloads.txt
@@ -0,0 +1,130 @@
+Segmentation Offloads in the Linux Networking Stack
+
+Introduction
+============
+
+This document describes a set of techniques in the Linux networking stack
+to take advantage of segmentation offload capabilities of various NICs.
+
+The following technologies are described:
+ * TCP Segmentation Offload - TSO
+ * UDP Fragmentation Offload - UFO
+ * IPIP, SIT, GRE, and UDP Tunnel Offloads
+ * Generic Segmentation Offload - GSO
+ * Generic Receive Offload - GRO
+ * Partial Generic Segmentation Offload - GSO_PARTIAL
+
+TCP Segmentation Offload
+========================
+
+TCP segmentation allows a device to segment a single frame into multiple
+frames with a data payload size specified in skb_shinfo()->gso_size.
+When TCP segmentation requested the bit for either SKB_GSO_TCP or
+SKB_GSO_TCP6 should be set in skb_shinfo()->gso_type and
+skb_shinfo()->gso_size should be set to a non-zero value.
+
+TCP segmentation is dependent on support for the use of partial checksum
+offload. For this reason TSO is normally disabled if the Tx checksum
+offload for a given device is disabled.
+
+In order to support TCP segmentation offload it is necessary to populate
+the network and transport header offsets of the skbuff so that the device
+drivers will be able determine the offsets of the IP or IPv6 header and the
+TCP header. In addition as CHECKSUM_PARTIAL is required csum_start should
+also point to the TCP header of the packet.
+
+For IPv4 segmentation we support one of two types in terms of the IP ID.
+The default behavior is to increment the IP ID with every segment. If the
+GSO type SKB_GSO_TCP_FIXEDID is specified then we will not increment the IP
+ID and all segments will use the same IP ID. If a device has
+NETIF_F_TSO_MANGLEID set then the IP ID can be ignored when performing TSO
+and we will either increment the IP ID for all frames, or leave it at a
+static value based on driver preference.
+
+UDP Fragmentation Offload
+=========================
+
+UDP fragmentation offload allows a device to fragment an oversized UDP
+datagram into multiple IPv4 fragments. Many of the requirements for UDP
+fragmentation offload are the same as TSO. However the IPv4 ID for
+fragments should not increment as a single IPv4 datagram is fragmented.
+
+IPIP, SIT, GRE, UDP Tunnel, and Remote Checksum Offloads
+========================================================
+
+In addition to the offloads described above it is possible for a frame to
+contain additional headers such as an outer tunnel. In order to account
+for such instances an additional set of segmentation offload types were
+introduced including SKB_GSO_IPIP, SKB_GSO_SIT, SKB_GSO_GRE, and
+SKB_GSO_UDP_TUNNEL. These extra segmentation types are used to identify
+cases where there are more than just 1 set of headers. For example in the
+case of IPIP and SIT we should have the network and transport headers moved
+from the standard list of headers to "inner" header offsets.
+
+Currently only two levels of headers are supported. The convention is to
+refer to the tunnel headers as the outer headers, while the encapsulated
+data is normally referred to as the inner headers. Below is the list of
+calls to access the given headers:
+
+IPIP/SIT Tunnel:
+ Outer Inner
+MAC skb_mac_header
+Network skb_network_header skb_inner_network_header
+Transport skb_transport_header
+
+UDP/GRE Tunnel:
+ Outer Inner
+MAC skb_mac_header skb_inner_mac_header
+Network skb_network_header skb_inner_network_header
+Transport skb_transport_header skb_inner_transport_header
+
+In addition to the above tunnel types there are also SKB_GSO_GRE_CSUM and
+SKB_GSO_UDP_TUNNEL_CSUM. These two additional tunnel types reflect the
+fact that the outer header also requests to have a non-zero checksum
+included in the outer header.
+
+Finally there is SKB_GSO_REMCSUM which indicates that a given tunnel header
+has requested a remote checksum offload. In this case the inner headers
+will be left with a partial checksum and only the outer header checksum
+will be computed.
+
+Generic Segmentation Offload
+============================
+
+Generic segmentation offload is a pure software offload that is meant to
+deal with cases where device drivers cannot perform the offloads described
+above. What occurs in GSO is that a given skbuff will have its data broken
+out over multiple skbuffs that have been resized to match the MSS provided
+via skb_shinfo()->gso_size.
+
+Before enabling any hardware segmentation offload a corresponding software
+offload is required in GSO. Otherwise it becomes possible for a frame to
+be re-routed between devices and end up being unable to be transmitted.
+
+Generic Receive Offload
+=======================
+
+Generic receive offload is the complement to GSO. Ideally any frame
+assembled by GRO should be segmented to create an identical sequence of
+frames using GSO, and any sequence of frames segmented by GSO should be
+able to be reassembled back to the original by GRO. The only exception to
+this is IPv4 ID in the case that the DF bit is set for a given IP header.
+If the value of the IPv4 ID is not sequentially incrementing it will be
+altered so that it is when a frame assembled via GRO is segmented via GSO.
+
+Partial Generic Segmentation Offload
+====================================
+
+Partial generic segmentation offload is a hybrid between TSO and GSO. What
+it effectively does is take advantage of certain traits of TCP and tunnels
+so that instead of having to rewrite the packet headers for each segment
+only the inner-most transport header and possibly the outer-most network
+header need to be updated. This allows devices that do not support tunnel
+offloads or tunnel offloads with checksum to still make use of segmentation.
+
+With the partial offload what occurs is that all headers excluding the
+inner transport header are updated such that they will contain the correct
+values for if the header was simply duplicated. The one exception to this
+is the outer IPv4 ID field. It is up to the device drivers to guarantee
+that the IPv4 ID field is incremented in the case that a given header does
+not have the DF bit set.
diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index a734bf43d190..9fc79df0e561 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -39,6 +39,7 @@ enum {
NETIF_F_UFO_BIT, /* ... UDPv4 fragmentation */
NETIF_F_GSO_ROBUST_BIT, /* ... ->SKB_GSO_DODGY */
NETIF_F_TSO_ECN_BIT, /* ... TCP ECN support */
+ NETIF_F_TSO_MANGLEID_BIT, /* ... IPV4 ID mangling allowed */
NETIF_F_TSO6_BIT, /* ... TCPv6 segmentation */
NETIF_F_FSO_BIT, /* ... FCoE segmentation */
NETIF_F_GSO_GRE_BIT, /* ... GRE with TSO */
@@ -47,6 +48,10 @@ enum {
NETIF_F_GSO_SIT_BIT, /* ... SIT tunnel with TSO */
NETIF_F_GSO_UDP_TUNNEL_BIT, /* ... UDP TUNNEL with TSO */
NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT,/* ... UDP TUNNEL with TSO & CSUM */
+ NETIF_F_GSO_PARTIAL_BIT, /* ... Only segment inner-most L4
+ * in hardware and all other
+ * headers in software.
+ */
NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */
/**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */
NETIF_F_GSO_TUNNEL_REMCSUM_BIT,
@@ -120,6 +125,8 @@ enum {
#define NETIF_F_GSO_SIT __NETIF_F(GSO_SIT)
#define NETIF_F_GSO_UDP_TUNNEL __NETIF_F(GSO_UDP_TUNNEL)
#define NETIF_F_GSO_UDP_TUNNEL_CSUM __NETIF_F(GSO_UDP_TUNNEL_CSUM)
+#define NETIF_F_TSO_MANGLEID __NETIF_F(TSO_MANGLEID)
+#define NETIF_F_GSO_PARTIAL __NETIF_F(GSO_PARTIAL)
#define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM)
#define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
#define NETIF_F_HW_VLAN_STAG_RX __NETIF_F(HW_VLAN_STAG_RX)
@@ -147,6 +154,7 @@ enum {
/* List of features with software fallbacks. */
#define NETIF_F_GSO_SOFTWARE (NETIF_F_TSO | NETIF_F_TSO_ECN | \
+ NETIF_F_TSO_MANGLEID | \
NETIF_F_TSO6 | NETIF_F_UFO)
/* List of IP checksum features. Note that NETIF_F_ HW_CSUM should not be
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9884fe9a6552..a3bb534576a3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1654,6 +1654,7 @@ struct net_device {
netdev_features_t vlan_features;
netdev_features_t hw_enc_features;
netdev_features_t mpls_features;
+ netdev_features_t gso_partial_features;
int ifindex;
int group;
@@ -2121,7 +2122,10 @@ struct napi_gro_cb {
/* Used in GRE, set in fou/gue_gro_receive */
u8 is_fou:1;
- /* 6 bit hole */
+ /* Used to determine if flush_id can be ignored */
+ u8 is_atomic:1;
+
+ /* 5 bit hole */
/* used to support CHECKSUM_COMPLETE for tunneling protocols */
__wsum csum;
@@ -3992,6 +3996,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type)
BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_UFO >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_DODGY != (NETIF_F_GSO_ROBUST >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_TCP_ECN != (NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT));
+ BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_MANGLEID >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_TCPV6 != (NETIF_F_TSO6 >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_FCOE != (NETIF_F_FSO >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_GRE != (NETIF_F_GSO_GRE >> NETIF_F_GSO_SHIFT));
@@ -4000,6 +4005,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type)
BUILD_BUG_ON(SKB_GSO_SIT != (NETIF_F_GSO_SIT >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT));
+ BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT));
return (features & feature) == feature;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 007381270ff8..da0ace389fec 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -465,23 +465,27 @@ enum {
/* This indicates the tcp segment has CWR set. */
SKB_GSO_TCP_ECN = 1 << 3,
- SKB_GSO_TCPV6 = 1 << 4,
+ SKB_GSO_TCP_FIXEDID = 1 << 4,
- SKB_GSO_FCOE = 1 << 5,
+ SKB_GSO_TCPV6 = 1 << 5,
- SKB_GSO_GRE = 1 << 6,
+ SKB_GSO_FCOE = 1 << 6,
- SKB_GSO_GRE_CSUM = 1 << 7,
+ SKB_GSO_GRE = 1 << 7,
- SKB_GSO_IPIP = 1 << 8,
+ SKB_GSO_GRE_CSUM = 1 << 8,
- SKB_GSO_SIT = 1 << 9,
+ SKB_GSO_IPIP = 1 << 9,
- SKB_GSO_UDP_TUNNEL = 1 << 10,
+ SKB_GSO_SIT = 1 << 10,
- SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11,
+ SKB_GSO_UDP_TUNNEL = 1 << 11,
- SKB_GSO_TUNNEL_REMCSUM = 1 << 12,
+ SKB_GSO_UDP_TUNNEL_CSUM = 1 << 12,
+
+ SKB_GSO_PARTIAL = 1 << 13,
+
+ SKB_GSO_TUNNEL_REMCSUM = 1 << 14,
};
#if BITS_PER_LONG > 32
@@ -3589,7 +3593,10 @@ static inline struct sec_path *skb_sec_path(struct sk_buff *skb)
* Keeps track of level of encapsulation of network headers.
*/
struct skb_gso_cb {
- int mac_offset;
+ union {
+ int mac_offset;
+ int data_offset;
+ };
int encap_level;
__wsum csum;
__u16 csum_start;
diff --git a/net/core/dev.c b/net/core/dev.c
index 09fb1ace9dc8..556dd09af3b8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2711,6 +2711,19 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
return ERR_PTR(err);
}
+ /* Only report GSO partial support if it will enable us to
+ * support segmentation on this frame without needing additional
+ * work.
+ */
+ if (features & NETIF_F_GSO_PARTIAL) {
+ netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
+ struct net_device *dev = skb->dev;
+
+ partial_features |= dev->features & dev->gso_partial_features;
+ if (!skb_gso_ok(skb, features | partial_features))
+ features &= ~NETIF_F_GSO_PARTIAL;
+ }
+
BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
@@ -2825,14 +2838,45 @@ static netdev_features_t dflt_features_check(const struct sk_buff *skb,
return vlan_features_check(skb, features);
}
+static netdev_features_t gso_features_check(const struct sk_buff *skb,
+ struct net_device *dev,
+ netdev_features_t features)
+{
+ u16 gso_segs = skb_shinfo(skb)->gso_segs;
+
+ if (gso_segs > dev->gso_max_segs)
+ return features & ~NETIF_F_GSO_MASK;
+
+ /* Support for GSO partial features requires software
+ * intervention before we can actually process the packets
+ * so we need to strip support for any partial features now
+ * and we can pull them back in after we have partially
+ * segmented the frame.
+ */
+ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
+ features &= ~dev->gso_partial_features;
+
+ /* Make sure to clear the IPv4 ID mangling feature if the
+ * IPv4 header has the potential to be fragmented.
+ */
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
+ struct iphdr *iph = skb->encapsulation ?
+ inner_ip_hdr(skb) : ip_hdr(skb);
+
+ if (!(iph->frag_off & htons(IP_DF)))
+ features &= ~NETIF_F_TSO_MANGLEID;
+ }
+
+ return features;
+}
+
netdev_features_t netif_skb_features(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
netdev_features_t features = dev->features;
- u16 gso_segs = skb_shinfo(skb)->gso_segs;
- if (gso_segs > dev->gso_max_segs)
- features &= ~NETIF_F_GSO_MASK;
+ if (skb_is_gso(skb))
+ features = gso_features_check(skb, dev, features);
/* If encapsulation offload request, verify we are testing
* hardware encapsulation features instead of standard
@@ -4440,6 +4484,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
NAPI_GRO_CB(skb)->free = 0;
NAPI_GRO_CB(skb)->encap_mark = 0;
NAPI_GRO_CB(skb)->is_fou = 0;
+ NAPI_GRO_CB(skb)->is_atomic = 1;
NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
/* Setup for GRO checksum validation */
@@ -6706,6 +6751,14 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
}
}
+ /* GSO partial features require GSO partial be set */
+ if ((features & dev->gso_partial_features) &&
+ !(features & NETIF_F_GSO_PARTIAL)) {
+ netdev_dbg(dev,
+ "Dropping partially supported GSO features since no GSO partial.\n");
+ features &= ~dev->gso_partial_features;
+ }
+
#ifdef CONFIG_NET_RX_BUSY_POLL
if (dev->netdev_ops->ndo_busy_poll)
features |= NETIF_F_BUSY_POLL;
@@ -6976,9 +7029,11 @@ int register_netdevice(struct net_device *dev)
dev->features |= NETIF_F_SOFT_FEATURES;
dev->wanted_features = dev->features & dev->hw_features;
- if (!(dev->flags & IFF_LOOPBACK)) {
+ if (!(dev->flags & IFF_LOOPBACK))
dev->hw_features |= NETIF_F_NOCACHE_COPY;
- }
+
+ if (dev->hw_features & NETIF_F_TSO)
+ dev->hw_features |= NETIF_F_TSO_MANGLEID;
/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
*/
@@ -6986,7 +7041,7 @@ int register_netdevice(struct net_device *dev)
/* Make NETIF_F_SG inheritable to tunnel devices.
*/
- dev->hw_enc_features |= NETIF_F_SG;
+ dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
/* Make NETIF_F_SG inheritable to MPLS.
*/
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index f426c5ad6149..e0cf20a3b3dd 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -79,12 +79,16 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_UFO_BIT] = "tx-udp-fragmentation",
[NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust",
[NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation",
+ [NETIF_F_TSO_MANGLEID_BIT] = "tx-tcp-mangleid-segmentation",
[NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation",
[NETIF_F_FSO_BIT] = "tx-fcoe-segmentation",
[NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation",
+ [NETIF_F_GSO_GRE_CSUM_BIT] = "tx-gre-csum-segmentation",
[NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation",
[NETIF_F_GSO_SIT_BIT] = "tx-sit-segmentation",
[NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation",
+ [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation",
+ [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial",
[NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
[NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp",
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d04c2d1c8c87..4cc594cdaada 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3076,8 +3076,9 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
struct sk_buff *frag_skb = head_skb;
unsigned int offset = doffset;
unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
+ unsigned int partial_segs = 0;
unsigned int headroom;
- unsigned int len;
+ unsigned int len = head_skb->len;
__be16 proto;
bool csum;
int sg = !!(features & NETIF_F_SG);
@@ -3094,6 +3095,15 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
csum = !!can_checksum_protocol(features, proto);
+ /* GSO partial only requires that we trim off any excess that
+ * doesn't fit into an MSS sized block, so take care of that
+ * now.
+ */
+ if (features & NETIF_F_GSO_PARTIAL) {
+ partial_segs = len / mss;
+ mss *= partial_segs;
+ }
+
headroom = skb_headroom(head_skb);
pos = skb_headlen(head_skb);
@@ -3281,6 +3291,23 @@ perform_csum_check:
*/
segs->prev = tail;
+ /* Update GSO info on first skb in partial sequence. */
+ if (partial_segs) {
+ int type = skb_shinfo(head_skb)->gso_type;
+
+ /* Update type to add partial and then remove dodgy if set */
+ type |= SKB_GSO_PARTIAL;
+ type &= ~SKB_GSO_DODGY;
+
+ /* Update GSO info and prepare to start updating headers on
+ * our way back down the stack of protocols.
+ */
+ skb_shinfo(segs)->gso_size = skb_shinfo(head_skb)->gso_size;
+ skb_shinfo(segs)->gso_segs = partial_segs;
+ skb_shinfo(segs)->gso_type = type;
+ SKB_GSO_CB(segs)->data_offset = skb_headroom(segs) + doffset;
+ }
+
/* Following permits correct backpressure, for protocols
* using skb_set_owner_w().
* Idea is to tranfert ownership from head_skb to last segment.
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8217cd22f921..2e6e65fc4d20 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1195,12 +1195,12 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
+ bool udpfrag = false, fixedid = false, encap;
struct sk_buff *segs = ERR_PTR(-EINVAL);
const struct net_offload *ops;
unsigned int offset = 0;
- bool udpfrag, encap;
struct iphdr *iph;
- int proto;
+ int proto, tot_len;
int nhoff;
int ihl;
int id;
@@ -1217,7 +1217,9 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
SKB_GSO_TCPV6 |
SKB_GSO_UDP_TUNNEL |
SKB_GSO_UDP_TUNNEL_CSUM |
+ SKB_GSO_TCP_FIXEDID |
SKB_GSO_TUNNEL_REMCSUM |
+ SKB_GSO_PARTIAL |
0)))
goto out;
@@ -1248,11 +1250,14 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
segs = ERR_PTR(-EPROTONOSUPPORT);
- if (skb->encapsulation &&
- skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP))
- udpfrag = proto == IPPROTO_UDP && encap;
- else
- udpfrag = proto == IPPROTO_UDP && !skb->encapsulation;
+ if (!skb->encapsulation || encap) {
+ udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
+ fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID);
+
+ /* fixed ID is invalid if DF bit is not set */
+ if (fixedid && !(iph->frag_off & htons(IP_DF)))
+ goto out;
+ }
ops = rcu_dereference(inet_offloads[proto]);
if (likely(ops && ops->callbacks.gso_segment))
@@ -1265,15 +1270,25 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
do {
iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
if (udpfrag) {
- iph->id = htons(id);
iph->frag_off = htons(offset >> 3);
if (skb->next)
iph->frag_off |= htons(IP_MF);
offset += skb->len - nhoff - ihl;
+ tot_len = skb->len - nhoff;
+ } else if (skb_is_gso(skb)) {
+ if (!fixedid) {
+ iph->id = htons(id);
+ id += skb_shinfo(skb)->gso_segs;
+ }
+ tot_len = skb_shinfo(skb)->gso_size +
+ SKB_GSO_CB(skb)->data_offset +
+ skb->head - (unsigned char *)iph;
} else {
- iph->id = htons(id++);
+ if (!fixedid)
+ iph->id = htons(id++);
+ tot_len = skb->len - nhoff;
}
- iph->tot_len = htons(skb->len - nhoff);
+ iph->tot_len = htons(tot_len);
ip_send_check(iph);
if (encap)
skb_reset_inner_headers(skb);
@@ -1325,6 +1340,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
for (p = *head; p; p = p->next) {
struct iphdr *iph2;
+ u16 flush_id;
if (!NAPI_GRO_CB(p)->same_flow)
continue;
@@ -1348,16 +1364,36 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
(iph->tos ^ iph2->tos) |
((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));
- /* Save the IP ID check to be included later when we get to
- * the transport layer so only the inner most IP ID is checked.
- * This is because some GSO/TSO implementations do not
- * correctly increment the IP ID for the outer hdrs.
- */
- NAPI_GRO_CB(p)->flush_id =
- ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
NAPI_GRO_CB(p)->flush |= flush;
+
+ /* We need to store of the IP ID check to be included later
+ * when we can verify that this packet does in fact belong
+ * to a given flow.
+ */
+ flush_id = (u16)(id - ntohs(iph2->id));
+
+ /* This bit of code makes it much easier for us to identify
+ * the cases where we are doing atomic vs non-atomic IP ID
+ * checks. Specifically an atomic check can return IP ID
+ * values 0 - 0xFFFF, while a non-atomic check can only
+ * return 0 or 0xFFFF.
+ */
+ if (!NAPI_GRO_CB(p)->is_atomic ||
+ !(iph->frag_off & htons(IP_DF))) {
+ flush_id ^= NAPI_GRO_CB(p)->count;
+ flush_id = flush_id ? 0xFFFF : 0;
+ }
+
+ /* If the previous IP ID value was based on an atomic
+ * datagram we can overwrite the value and ignore it.
+ */
+ if (NAPI_GRO_CB(skb)->is_atomic)
+ NAPI_GRO_CB(p)->flush_id = flush_id;
+ else
+ NAPI_GRO_CB(p)->flush_id |= flush_id;
}
+ NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF));
NAPI_GRO_CB(skb)->flush |= flush;
skb_set_network_header(skb, off);
/* The above will be needed by the transport layer if there is one
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 6a5bd4317866..20557f211408 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -32,10 +32,12 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
SKB_GSO_UDP |
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
+ SKB_GSO_TCP_FIXEDID |
SKB_GSO_GRE |
SKB_GSO_GRE_CSUM |
SKB_GSO_IPIP |
- SKB_GSO_SIT)))
+ SKB_GSO_SIT |
+ SKB_GSO_PARTIAL)))
goto out;
if (!skb->encapsulation)
@@ -86,7 +88,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
skb = segs;
do {
struct gre_base_hdr *greh;
- __be32 *pcsum;
+ __sum16 *pcsum;
/* Set up inner headers if we are offloading inner checksum */
if (skb->ip_summed == CHECKSUM_PARTIAL) {
@@ -106,10 +108,25 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
continue;
greh = (struct gre_base_hdr *)skb_transport_header(skb);
- pcsum = (__be32 *)(greh + 1);
+ pcsum = (__sum16 *)(greh + 1);
+
+ if (skb_is_gso(skb)) {
+ unsigned int partial_adj;
+
+ /* Adjust checksum to account for the fact that
+ * the partial checksum is based on actual size
+ * whereas headers should be based on MSS size.
+ */
+ partial_adj = skb->len + skb_headroom(skb) -
+ SKB_GSO_CB(skb)->data_offset -
+ skb_shinfo(skb)->gso_size;
+ *pcsum = ~csum_fold((__force __wsum)htonl(partial_adj));
+ } else {
+ *pcsum = 0;
+ }
- *pcsum = 0;
- *(__sum16 *)pcsum = gso_make_checksum(skb, 0);
+ *(pcsum + 1) = 0;
+ *pcsum = gso_make_checksum(skb, 0);
} while ((skb = skb->next));
out:
return segs;
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 773083b7f1e9..02737b607aa7 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -89,6 +89,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
~(SKB_GSO_TCPV4 |
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
+ SKB_GSO_TCP_FIXEDID |
SKB_GSO_TCPV6 |
SKB_GSO_GRE |
SKB_GSO_GRE_CSUM |
@@ -98,7 +99,8 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
SKB_GSO_UDP_TUNNEL_CSUM |
SKB_GSO_TUNNEL_REMCSUM |
0) ||
- !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
+ !(type & (SKB_GSO_TCPV4 |
+ SKB_GSO_TCPV6))))
goto out;
skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
@@ -107,6 +109,12 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
goto out;
}
+ /* GSO partial only requires splitting the frame into an MSS
+ * multiple and possibly a remainder. So update the mss now.
+ */
+ if (features & NETIF_F_GSO_PARTIAL)
+ mss = skb->len - (skb->len % mss);
+
copy_destructor = gso_skb->destructor == tcp_wfree;
ooo_okay = gso_skb->ooo_okay;
/* All segments but the first should have ooo_okay cleared */
@@ -131,7 +139,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
newcheck = ~csum_fold((__force __wsum)((__force u32)th->check +
(__force u32)delta));
- do {
+ while (skb->next) {
th->fin = th->psh = 0;
th->check = newcheck;
@@ -151,7 +159,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
th->seq = htonl(seq);
th->cwr = 0;
- } while (skb->next);
+ }
/* Following permits TCP Small Queues to work well with GSO :
* The callback to TCP stack will be called at the time last frag
@@ -237,7 +245,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
found:
/* Include the IP ID check below from the inner most IP hdr */
- flush = NAPI_GRO_CB(p)->flush | NAPI_GRO_CB(p)->flush_id;
+ flush = NAPI_GRO_CB(p)->flush;
flush |= (__force int)(flags & TCP_FLAG_CWR);
flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
@@ -246,6 +254,17 @@ found:
flush |= *(u32 *)((u8 *)th + i) ^
*(u32 *)((u8 *)th2 + i);
+ /* When we receive our second frame we can made a decision on if we
+ * continue this flow as an atomic flow with a fixed ID or if we use
+ * an incrementing ID.
+ */
+ if (NAPI_GRO_CB(p)->flush_id != 1 ||
+ NAPI_GRO_CB(p)->count != 1 ||
+ !NAPI_GRO_CB(p)->is_atomic)
+ flush |= NAPI_GRO_CB(p)->flush_id;
+ else
+ NAPI_GRO_CB(p)->is_atomic = false;
+
mss = skb_shinfo(p)->gso_size;
flush |= (len - 1) >= mss;
@@ -314,6 +333,9 @@ static int tcp4_gro_complete(struct sk_buff *skb, int thoff)
iph->daddr, 0);
skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
+ if (NAPI_GRO_CB(skb)->is_atomic)
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_FIXEDID;
+
return tcp_gro_complete(skb);
}
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 6230cf4b0d2d..097060def7f0 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -39,8 +39,11 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
* 16 bit length field due to the header being added outside of an
* IP or IPv6 frame that was already limited to 64K - 1.
*/
- partial = csum_sub(csum_unfold(uh->check),
- (__force __wsum)htonl(skb->len));
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)
+ partial = (__force __wsum)uh->len;
+ else
+ partial = (__force __wsum)htonl(skb->len);
+ partial = csum_sub(csum_unfold(uh->check), partial);
/* setup inner skb. */
skb->encapsulation = 0;
@@ -89,7 +92,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
udp_offset = outer_hlen - tnl_hlen;
skb = segs;
do {
- __be16 len;
+ unsigned int len;
if (remcsum)
skb->ip_summed = CHECKSUM_NONE;
@@ -107,14 +110,26 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
skb_reset_mac_header(skb);
skb_set_network_header(skb, mac_len);
skb_set_transport_header(skb, udp_offset);
- len = htons(skb->len - udp_offset);
+ len = skb->len - udp_offset;
uh = udp_hdr(skb);
- uh->len = len;
+
+ /* If we are only performing partial GSO the inner header
+ * will be using a length value equal to only one MSS sized
+ * segment instead of the entire frame.
+ */
+ if (skb_is_gso(skb)) {
+ uh->len = htons(skb_shinfo(skb)->gso_size +
+ SKB_GSO_CB(skb)->data_offset +
+ skb->head - (unsigned char *)uh);
+ } else {
+ uh->len = htons(len);
+ }
if (!need_csum)
continue;
- uh->check = ~csum_fold(csum_add(partial, (__force __wsum)len));
+ uh->check = ~csum_fold(csum_add(partial,
+ (__force __wsum)htonl(len)));
if (skb->encapsulation || !offload_csum) {
uh->check = gso_make_checksum(skb, ~uh->check);
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 204af2219471..f5eb184e1093 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -63,6 +63,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
int proto;
struct frag_hdr *fptr;
unsigned int unfrag_ip6hlen;
+ unsigned int payload_len;
u8 *prevhdr;
int offset = 0;
bool encap, udpfrag;
@@ -73,6 +74,8 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
SKB_GSO_UDP |
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
+ SKB_GSO_TCP_FIXEDID |
+ SKB_GSO_TCPV6 |
SKB_GSO_GRE |
SKB_GSO_GRE_CSUM |
SKB_GSO_IPIP |
@@ -80,7 +83,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
SKB_GSO_UDP_TUNNEL |
SKB_GSO_UDP_TUNNEL_CSUM |
SKB_GSO_TUNNEL_REMCSUM |
- SKB_GSO_TCPV6 |
+ SKB_GSO_PARTIAL |
0)))
goto out;
@@ -117,7 +120,13 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
for (skb = segs; skb; skb = skb->next) {
ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff);
- ipv6h->payload_len = htons(skb->len - nhoff - sizeof(*ipv6h));
+ if (skb_is_gso(skb))
+ payload_len = skb_shinfo(skb)->gso_size +
+ SKB_GSO_CB(skb)->data_offset +
+ skb->head - (unsigned char *)(ipv6h + 1);
+ else
+ payload_len = skb->len - nhoff - sizeof(*ipv6h);
+ ipv6h->payload_len = htons(payload_len);
skb->network_header = (u8 *)ipv6h - skb->head;
if (udpfrag) {
@@ -239,10 +248,14 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
NAPI_GRO_CB(p)->flush |= !!(first_word & htonl(0x0FF00000));
NAPI_GRO_CB(p)->flush |= flush;
- /* Clear flush_id, there's really no concept of ID in IPv6. */
- NAPI_GRO_CB(p)->flush_id = 0;
+ /* If the previous IP ID value was based on an atomic
+ * datagram we can overwrite the value and ignore it.
+ */
+ if (NAPI_GRO_CB(skb)->is_atomic)
+ NAPI_GRO_CB(p)->flush_id = 0;
}
+ NAPI_GRO_CB(skb)->is_atomic = true;
NAPI_GRO_CB(skb)->flush |= flush;
skb_gro_postpull_rcsum(skb, iph, nlen);
diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c
index 0183b32da942..bbcf60465e5c 100644
--- a/net/mpls/mpls_gso.c
+++ b/net/mpls/mpls_gso.c
@@ -31,6 +31,7 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb,
SKB_GSO_TCPV6 |
SKB_GSO_UDP |
SKB_GSO_DODGY |
+ SKB_GSO_TCP_FIXEDID |
SKB_GSO_TCP_ECN)))
goto out;