From 4ed377e36ec2f385484d12e516faf88516fad31c Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Sat, 21 Sep 2013 06:32:34 +0200 Subject: net: neighbour: use source address of last enqueued packet for solicitation Currently we always use the first member of the arp_queue to determine the sender ip address of the arp packet (or in case of IPv6 - source address of the ndisc packet). This skb is fixed as long as the queue is not drained by a complete purge because of a timeout or by a successful response. If the first packet enqueued on the arp_queue is from a local application with a manually set source address and the to be discovered system does some kind of uRPF checks on the source address in the arp packet the resolving process hangs until a timeout and restarts. This hurts communication with the participating network node. This could be mitigated a bit if we use the latest enqueued skb's source address for the resolving process, which is not as static as the arp_queue's head. This change of the source address could result in better recovery of a failed solicitation. Cc: "David S. Miller" Cc: Julian Anastasov Reviewed-by: Julian Anastasov Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/core/neighbour.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 6072610a8672..ca15f32821fb 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -867,7 +867,7 @@ static void neigh_invalidate(struct neighbour *neigh) static void neigh_probe(struct neighbour *neigh) __releases(neigh->lock) { - struct sk_buff *skb = skb_peek(&neigh->arp_queue); + struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue); /* keep skb alive even if arp_queue overflows */ if (skb) skb = skb_copy(skb, GFP_ATOMIC); -- cgit v1.2.3 From 7863c054d1b4fd35f76c13e2e918f7f483fe48f4 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 25 Sep 2013 09:20:06 +0200 Subject: net: use lists as arguments instead of bool upper Currently we make use of bool upper when we want to specify if we want to work with upper/lower list. It's, however, harder to read, debug and occupies a lot more code. Fix this by just passing the correct upper/lower_dev_list list_head pointer instead of bool upper, and work internally with it. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck CC: Cong Wang Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/core/dev.c | 54 ++++++++++++++++++++++-------------------------------- 1 file changed, 22 insertions(+), 32 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 5c713f2239cc..9be79377a0f3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4385,12 +4385,9 @@ struct netdev_adjacent { static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, struct net_device *adj_dev, - bool upper) + struct list_head *dev_list) { struct netdev_adjacent *adj; - struct list_head *dev_list; - - dev_list = upper ? &dev->upper_dev_list : &dev->lower_dev_list; list_for_each_entry(adj, dev_list, list) { if (adj->dev == adj_dev) @@ -4402,13 +4399,13 @@ static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, static inline struct netdev_adjacent *__netdev_find_upper(struct net_device *dev, struct net_device *udev) { - return __netdev_find_adj(dev, udev, true); + return __netdev_find_adj(dev, udev, &dev->upper_dev_list); } static inline struct netdev_adjacent *__netdev_find_lower(struct net_device *dev, struct net_device *ldev) { - return __netdev_find_adj(dev, ldev, false); + return __netdev_find_adj(dev, ldev, &dev->lower_dev_list); } /** @@ -4514,12 +4511,12 @@ EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); static int __netdev_adjacent_dev_insert(struct net_device *dev, struct net_device *adj_dev, - bool neighbour, bool master, - bool upper) + struct list_head *dev_list, + bool neighbour, bool master) { struct netdev_adjacent *adj; - adj = __netdev_find_adj(dev, adj_dev, upper); + adj = __netdev_find_adj(dev, adj_dev, dev_list); if (adj) { BUG_ON(neighbour); @@ -4538,19 +4535,14 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, dev_hold(adj_dev); pr_debug("dev_hold for %s, because of %s link added from %s to %s\n", - adj_dev->name, upper ? "upper" : "lower", dev->name, - adj_dev->name); + adj_dev->name, dev_list == &dev->upper_dev_list ? + "upper" : "lower", dev->name, adj_dev->name); - if (!upper) { - list_add_tail_rcu(&adj->list, &dev->lower_dev_list); - return 0; - } - - /* Ensure that master upper link is always the first item in list. */ + /* Ensure that master link is always the first item in list. */ if (master) - list_add_rcu(&adj->list, &dev->upper_dev_list); + list_add_rcu(&adj->list, dev_list); else - list_add_tail_rcu(&adj->list, &dev->upper_dev_list); + list_add_tail_rcu(&adj->list, dev_list); return 0; } @@ -4559,27 +4551,25 @@ static inline int __netdev_upper_dev_insert(struct net_device *dev, struct net_device *udev, bool master, bool neighbour) { - return __netdev_adjacent_dev_insert(dev, udev, neighbour, master, - true); + return __netdev_adjacent_dev_insert(dev, udev, &dev->upper_dev_list, + neighbour, master); } static inline int __netdev_lower_dev_insert(struct net_device *dev, struct net_device *ldev, bool neighbour) { - return __netdev_adjacent_dev_insert(dev, ldev, neighbour, false, - false); + return __netdev_adjacent_dev_insert(dev, ldev, &dev->lower_dev_list, + neighbour, false); } void __netdev_adjacent_dev_remove(struct net_device *dev, - struct net_device *adj_dev, bool upper) + struct net_device *adj_dev, + struct list_head *dev_list) { struct netdev_adjacent *adj; - if (upper) - adj = __netdev_find_upper(dev, adj_dev); - else - adj = __netdev_find_lower(dev, adj_dev); + adj = __netdev_find_adj(dev, adj_dev, dev_list); if (!adj) BUG(); @@ -4591,8 +4581,8 @@ void __netdev_adjacent_dev_remove(struct net_device *dev, list_del_rcu(&adj->list); pr_debug("dev_put for %s, because of %s link removed from %s to %s\n", - adj_dev->name, upper ? "upper" : "lower", dev->name, - adj_dev->name); + adj_dev->name, dev_list == &dev->upper_dev_list ? + "upper" : "lower", dev->name, adj_dev->name); dev_put(adj_dev); kfree_rcu(adj, rcu); } @@ -4600,13 +4590,13 @@ void __netdev_adjacent_dev_remove(struct net_device *dev, static inline void __netdev_upper_dev_remove(struct net_device *dev, struct net_device *udev) { - return __netdev_adjacent_dev_remove(dev, udev, true); + return __netdev_adjacent_dev_remove(dev, udev, &dev->upper_dev_list); } static inline void __netdev_lower_dev_remove(struct net_device *dev, struct net_device *ldev) { - return __netdev_adjacent_dev_remove(dev, ldev, false); + return __netdev_adjacent_dev_remove(dev, ldev, &dev->lower_dev_list); } int __netdev_adjacent_dev_insert_link(struct net_device *dev, -- cgit v1.2.3 From 2f268f129c2d1a05d297fe3ee34d393f862d2b22 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 25 Sep 2013 09:20:07 +0200 Subject: net: add adj_list to save only neighbours Currently, we distinguish neighbours (first-level linked devices) from non-neighbours by the neighbour bool in the netdev_adjacent. This could be quite time-consuming in case we would like to traverse *only* through neighbours - cause we'd have to traverse through all devices and check for this flag, and in a (quite common) scenario where we have lots of vlans on top of bridge, which is on top of a bond - the bonding would have to go through all those vlans to get its upper neighbour linked devices. This situation is really unpleasant, cause there are already a lot of cases when a device with slaves needs to go through them in hot path. To fix this, introduce a new upper/lower device lists structure - adj_list, which contains only the neighbours. It works always in pair with the all_adj_list structure (renamed from upper/lower_dev_list), i.e. both of them contain the same links, only that all_adj_list contains also non-neighbour device links. It's really a small change visible, currently, only for __netdev_adjacent_dev_insert/remove(), and doesn't change the main linked logic at all. Also, add some comments a fix a name collision in netdev_for_each_upper_dev_rcu() and rework the naming by the following rules: netdev_(all_)(upper|lower)_* If "all_" is present, then we work with the whole list of upper/lower devices, otherwise - only with direct neighbours. Uninline functions - to get better stack traces. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck CC: Cong Wang Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- drivers/net/bonding/bond_alb.c | 2 +- drivers/net/bonding/bond_main.c | 10 +- include/linux/netdevice.h | 28 ++++-- net/core/dev.c | 203 ++++++++++++++++++++-------------------- 4 files changed, 129 insertions(+), 114 deletions(-) (limited to 'net/core') diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c index f428ef574372..8524e33e6754 100644 --- a/drivers/net/bonding/bond_alb.c +++ b/drivers/net/bonding/bond_alb.c @@ -1019,7 +1019,7 @@ static void alb_send_learning_packets(struct slave *slave, u8 mac_addr[]) /* loop through vlans and send one packet for each */ rcu_read_lock(); - netdev_for_each_upper_dev_rcu(bond->dev, upper, iter) { + netdev_for_each_all_upper_dev_rcu(bond->dev, upper, iter) { if (upper->priv_flags & IFF_802_1Q_VLAN) alb_send_lp_vid(slave, mac_addr, vlan_dev_vlan_id(upper)); diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 55bbb8b8200c..91c4ab8913b1 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2267,7 +2267,7 @@ static bool bond_has_this_ip(struct bonding *bond, __be32 ip) return true; rcu_read_lock(); - netdev_for_each_upper_dev_rcu(bond->dev, upper, iter) { + netdev_for_each_all_upper_dev_rcu(bond->dev, upper, iter) { if (ip == bond_confirm_addr(upper, 0, ip)) { ret = true; break; @@ -2342,10 +2342,12 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave) * * TODO: QinQ? */ - netdev_for_each_upper_dev_rcu(bond->dev, vlan_upper, vlan_iter) { + netdev_for_each_all_upper_dev_rcu(bond->dev, vlan_upper, + vlan_iter) { if (!is_vlan_dev(vlan_upper)) continue; - netdev_for_each_upper_dev_rcu(vlan_upper, upper, iter) { + netdev_for_each_all_upper_dev_rcu(vlan_upper, upper, + iter) { if (upper == rt->dst.dev) { vlan_id = vlan_dev_vlan_id(vlan_upper); rcu_read_unlock(); @@ -2358,7 +2360,7 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave) * our upper vlans, then just search for any dev that * matches, and in case it's a vlan - save the id */ - netdev_for_each_upper_dev_rcu(bond->dev, upper, iter) { + netdev_for_each_all_upper_dev_rcu(bond->dev, upper, iter) { if (upper == rt->dst.dev) { /* if it's a vlan - get its VID */ if (is_vlan_dev(upper)) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3de49aca4519..514045c704a8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1143,8 +1143,18 @@ struct net_device { struct list_head dev_list; struct list_head napi_list; struct list_head unreg_list; - struct list_head upper_dev_list; /* List of upper devices */ - struct list_head lower_dev_list; + + /* directly linked devices, like slaves for bonding */ + struct { + struct list_head upper; + struct list_head lower; + } adj_list; + + /* all linked devices, *including* neighbours */ + struct { + struct list_head upper; + struct list_head lower; + } all_adj_list; /* currently active device features */ @@ -2813,15 +2823,15 @@ extern int bpf_jit_enable; extern bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev); extern bool netdev_has_any_upper_dev(struct net_device *dev); -extern struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, - struct list_head **iter); +extern struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, + struct list_head **iter); /* iterate through upper list, must be called under RCU read lock */ -#define netdev_for_each_upper_dev_rcu(dev, upper, iter) \ - for (iter = &(dev)->upper_dev_list, \ - upper = netdev_upper_get_next_dev_rcu(dev, &(iter)); \ - upper; \ - upper = netdev_upper_get_next_dev_rcu(dev, &(iter))) +#define netdev_for_each_all_upper_dev_rcu(dev, updev, iter) \ + for (iter = &(dev)->all_adj_list.upper, \ + updev = netdev_all_upper_get_next_dev_rcu(dev, &(iter)); \ + updev; \ + updev = netdev_all_upper_get_next_dev_rcu(dev, &(iter))) extern struct net_device *netdev_master_upper_dev_get(struct net_device *dev); extern struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev); diff --git a/net/core/dev.c b/net/core/dev.c index 9be79377a0f3..9a395e03da74 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4373,9 +4373,6 @@ struct netdev_adjacent { /* upper master flag, there can only be one master device per list */ bool master; - /* indicates that this dev is our first-level lower/upper device */ - bool neighbour; - /* counter for the number of times this device was added to us */ u16 ref_nr; @@ -4385,29 +4382,17 @@ struct netdev_adjacent { static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, struct net_device *adj_dev, - struct list_head *dev_list) + struct list_head *adj_list) { struct netdev_adjacent *adj; - list_for_each_entry(adj, dev_list, list) { + list_for_each_entry(adj, adj_list, list) { if (adj->dev == adj_dev) return adj; } return NULL; } -static inline struct netdev_adjacent *__netdev_find_upper(struct net_device *dev, - struct net_device *udev) -{ - return __netdev_find_adj(dev, udev, &dev->upper_dev_list); -} - -static inline struct netdev_adjacent *__netdev_find_lower(struct net_device *dev, - struct net_device *ldev) -{ - return __netdev_find_adj(dev, ldev, &dev->lower_dev_list); -} - /** * netdev_has_upper_dev - Check if device is linked to an upper device * @dev: device @@ -4422,7 +4407,7 @@ bool netdev_has_upper_dev(struct net_device *dev, { ASSERT_RTNL(); - return __netdev_find_upper(dev, upper_dev); + return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper); } EXPORT_SYMBOL(netdev_has_upper_dev); @@ -4437,7 +4422,7 @@ bool netdev_has_any_upper_dev(struct net_device *dev) { ASSERT_RTNL(); - return !list_empty(&dev->upper_dev_list); + return !list_empty(&dev->all_adj_list.upper); } EXPORT_SYMBOL(netdev_has_any_upper_dev); @@ -4454,10 +4439,10 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) ASSERT_RTNL(); - if (list_empty(&dev->upper_dev_list)) + if (list_empty(&dev->adj_list.upper)) return NULL; - upper = list_first_entry(&dev->upper_dev_list, + upper = list_first_entry(&dev->adj_list.upper, struct netdev_adjacent, list); if (likely(upper->master)) return upper->dev; @@ -4465,15 +4450,15 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) } EXPORT_SYMBOL(netdev_master_upper_dev_get); -/* netdev_upper_get_next_dev_rcu - Get the next dev from upper list +/* netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list * @dev: device * @iter: list_head ** of the current position * * Gets the next device from the dev's upper list, starting from iter * position. The caller must hold RCU read lock. */ -struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, - struct list_head **iter) +struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, + struct list_head **iter) { struct netdev_adjacent *upper; @@ -4481,14 +4466,14 @@ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); - if (&upper->list == &dev->upper_dev_list) + if (&upper->list == &dev->all_adj_list.upper) return NULL; *iter = &upper->list; return upper->dev; } -EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); +EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu); /** * netdev_master_upper_dev_get_rcu - Get master upper device @@ -4501,7 +4486,7 @@ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) { struct netdev_adjacent *upper; - upper = list_first_or_null_rcu(&dev->upper_dev_list, + upper = list_first_or_null_rcu(&dev->adj_list.upper, struct netdev_adjacent, list); if (upper && likely(upper->master)) return upper->dev; @@ -4512,14 +4497,13 @@ EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); static int __netdev_adjacent_dev_insert(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list, - bool neighbour, bool master) + bool master) { struct netdev_adjacent *adj; adj = __netdev_find_adj(dev, adj_dev, dev_list); if (adj) { - BUG_ON(neighbour); adj->ref_nr++; return 0; } @@ -4530,13 +4514,11 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, adj->dev = adj_dev; adj->master = master; - adj->neighbour = neighbour; adj->ref_nr = 1; - dev_hold(adj_dev); - pr_debug("dev_hold for %s, because of %s link added from %s to %s\n", - adj_dev->name, dev_list == &dev->upper_dev_list ? - "upper" : "lower", dev->name, adj_dev->name); + + pr_debug("dev_hold for %s, because of link added from %s to %s\n", + adj_dev->name, dev->name, adj_dev->name); /* Ensure that master link is always the first item in list. */ if (master) @@ -4547,22 +4529,6 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, return 0; } -static inline int __netdev_upper_dev_insert(struct net_device *dev, - struct net_device *udev, - bool master, bool neighbour) -{ - return __netdev_adjacent_dev_insert(dev, udev, &dev->upper_dev_list, - neighbour, master); -} - -static inline int __netdev_lower_dev_insert(struct net_device *dev, - struct net_device *ldev, - bool neighbour) -{ - return __netdev_adjacent_dev_insert(dev, ldev, &dev->lower_dev_list, - neighbour, false); -} - void __netdev_adjacent_dev_remove(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list) @@ -4571,73 +4537,102 @@ void __netdev_adjacent_dev_remove(struct net_device *dev, adj = __netdev_find_adj(dev, adj_dev, dev_list); - if (!adj) + if (!adj) { + pr_err("tried to remove device %s from %s\n", + dev->name, adj_dev->name); BUG(); + } if (adj->ref_nr > 1) { + pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name, + adj->ref_nr-1); adj->ref_nr--; return; } list_del_rcu(&adj->list); - pr_debug("dev_put for %s, because of %s link removed from %s to %s\n", - adj_dev->name, dev_list == &dev->upper_dev_list ? - "upper" : "lower", dev->name, adj_dev->name); + pr_debug("dev_put for %s, because link removed from %s to %s\n", + adj_dev->name, dev->name, adj_dev->name); dev_put(adj_dev); kfree_rcu(adj, rcu); } -static inline void __netdev_upper_dev_remove(struct net_device *dev, - struct net_device *udev) -{ - return __netdev_adjacent_dev_remove(dev, udev, &dev->upper_dev_list); -} - -static inline void __netdev_lower_dev_remove(struct net_device *dev, - struct net_device *ldev) -{ - return __netdev_adjacent_dev_remove(dev, ldev, &dev->lower_dev_list); -} - -int __netdev_adjacent_dev_insert_link(struct net_device *dev, - struct net_device *upper_dev, - bool master, bool neighbour) +int __netdev_adjacent_dev_link_lists(struct net_device *dev, + struct net_device *upper_dev, + struct list_head *up_list, + struct list_head *down_list, + bool master) { int ret; - ret = __netdev_upper_dev_insert(dev, upper_dev, master, neighbour); + ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, master); if (ret) return ret; - ret = __netdev_lower_dev_insert(upper_dev, dev, neighbour); + ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, false); if (ret) { - __netdev_upper_dev_remove(dev, upper_dev); + __netdev_adjacent_dev_remove(dev, upper_dev, up_list); return ret; } return 0; } -static inline int __netdev_adjacent_dev_link(struct net_device *dev, - struct net_device *udev) +int __netdev_adjacent_dev_link(struct net_device *dev, + struct net_device *upper_dev) { - return __netdev_adjacent_dev_insert_link(dev, udev, false, false); + return __netdev_adjacent_dev_link_lists(dev, upper_dev, + &dev->all_adj_list.upper, + &upper_dev->all_adj_list.lower, + false); } -static inline int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, - struct net_device *udev, - bool master) +void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, + struct net_device *upper_dev, + struct list_head *up_list, + struct list_head *down_list) { - return __netdev_adjacent_dev_insert_link(dev, udev, master, true); + __netdev_adjacent_dev_remove(dev, upper_dev, up_list); + __netdev_adjacent_dev_remove(upper_dev, dev, down_list); } void __netdev_adjacent_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { - __netdev_upper_dev_remove(dev, upper_dev); - __netdev_lower_dev_remove(upper_dev, dev); + __netdev_adjacent_dev_unlink_lists(dev, upper_dev, + &dev->all_adj_list.upper, + &upper_dev->all_adj_list.lower); +} + +int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, + struct net_device *upper_dev, + bool master) +{ + int ret = __netdev_adjacent_dev_link(dev, upper_dev); + + if (ret) + return ret; + + ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, + &dev->adj_list.upper, + &upper_dev->adj_list.lower, + master); + if (ret) { + __netdev_adjacent_dev_unlink(dev, upper_dev); + return ret; + } + + return 0; } +void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, + struct net_device *upper_dev) +{ + __netdev_adjacent_dev_unlink(dev, upper_dev); + __netdev_adjacent_dev_unlink_lists(dev, upper_dev, + &dev->adj_list.upper, + &upper_dev->adj_list.lower); +} static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master) @@ -4651,10 +4646,10 @@ static int __netdev_upper_dev_link(struct net_device *dev, return -EBUSY; /* To prevent loops, check if dev is not upper device to upper_dev. */ - if (__netdev_find_upper(upper_dev, dev)) + if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper)) return -EBUSY; - if (__netdev_find_upper(dev, upper_dev)) + if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper)) return -EEXIST; if (master && netdev_master_upper_dev_get(dev)) @@ -4665,12 +4660,14 @@ static int __netdev_upper_dev_link(struct net_device *dev, return ret; /* Now that we linked these devs, make all the upper_dev's - * upper_dev_list visible to every dev's lower_dev_list and vice + * all_adj_list.upper visible to every dev's all_adj_list.lower an * versa, and don't forget the devices itself. All of these * links are non-neighbours. */ - list_for_each_entry(i, &dev->lower_dev_list, list) { - list_for_each_entry(j, &upper_dev->upper_dev_list, list) { + list_for_each_entry(i, &dev->all_adj_list.lower, list) { + list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { + pr_debug("Interlinking %s with %s, non-neighbour\n", + i->dev->name, j->dev->name); ret = __netdev_adjacent_dev_link(i->dev, j->dev); if (ret) goto rollback_mesh; @@ -4678,14 +4675,18 @@ static int __netdev_upper_dev_link(struct net_device *dev, } /* add dev to every upper_dev's upper device */ - list_for_each_entry(i, &upper_dev->upper_dev_list, list) { + list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { + pr_debug("linking %s's upper device %s with %s\n", + upper_dev->name, i->dev->name, dev->name); ret = __netdev_adjacent_dev_link(dev, i->dev); if (ret) goto rollback_upper_mesh; } /* add upper_dev to every dev's lower device */ - list_for_each_entry(i, &dev->lower_dev_list, list) { + list_for_each_entry(i, &dev->all_adj_list.lower, list) { + pr_debug("linking %s's lower device %s with %s\n", dev->name, + i->dev->name, upper_dev->name); ret = __netdev_adjacent_dev_link(i->dev, upper_dev); if (ret) goto rollback_lower_mesh; @@ -4696,7 +4697,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, rollback_lower_mesh: to_i = i; - list_for_each_entry(i, &dev->lower_dev_list, list) { + list_for_each_entry(i, &dev->all_adj_list.lower, list) { if (i == to_i) break; __netdev_adjacent_dev_unlink(i->dev, upper_dev); @@ -4706,7 +4707,7 @@ rollback_lower_mesh: rollback_upper_mesh: to_i = i; - list_for_each_entry(i, &upper_dev->upper_dev_list, list) { + list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { if (i == to_i) break; __netdev_adjacent_dev_unlink(dev, i->dev); @@ -4717,8 +4718,8 @@ rollback_upper_mesh: rollback_mesh: to_i = i; to_j = j; - list_for_each_entry(i, &dev->lower_dev_list, list) { - list_for_each_entry(j, &upper_dev->upper_dev_list, list) { + list_for_each_entry(i, &dev->all_adj_list.lower, list) { + list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { if (i == to_i && j == to_j) break; __netdev_adjacent_dev_unlink(i->dev, j->dev); @@ -4727,7 +4728,7 @@ rollback_mesh: break; } - __netdev_adjacent_dev_unlink(dev, upper_dev); + __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); return ret; } @@ -4781,23 +4782,23 @@ void netdev_upper_dev_unlink(struct net_device *dev, struct netdev_adjacent *i, *j; ASSERT_RTNL(); - __netdev_adjacent_dev_unlink(dev, upper_dev); + __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); /* Here is the tricky part. We must remove all dev's lower * devices from all upper_dev's upper devices and vice * versa, to maintain the graph relationship. */ - list_for_each_entry(i, &dev->lower_dev_list, list) - list_for_each_entry(j, &upper_dev->upper_dev_list, list) + list_for_each_entry(i, &dev->all_adj_list.lower, list) + list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) __netdev_adjacent_dev_unlink(i->dev, j->dev); /* remove also the devices itself from lower/upper device * list */ - list_for_each_entry(i, &dev->lower_dev_list, list) + list_for_each_entry(i, &dev->all_adj_list.lower, list) __netdev_adjacent_dev_unlink(i->dev, upper_dev); - list_for_each_entry(i, &upper_dev->upper_dev_list, list) + list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) __netdev_adjacent_dev_unlink(dev, i->dev); call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); @@ -6059,8 +6060,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); INIT_LIST_HEAD(&dev->link_watch_list); - INIT_LIST_HEAD(&dev->upper_dev_list); - INIT_LIST_HEAD(&dev->lower_dev_list); + INIT_LIST_HEAD(&dev->adj_list.upper); + INIT_LIST_HEAD(&dev->adj_list.lower); + INIT_LIST_HEAD(&dev->all_adj_list.upper); + INIT_LIST_HEAD(&dev->all_adj_list.lower); dev->priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); -- cgit v1.2.3 From 5249dec7380cb64928d2ae6201028b4da1dceb1e Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 25 Sep 2013 09:20:08 +0200 Subject: net: add RCU variant to search for netdev_adjacent link Currently we have only the RTNL flavour, however we can traverse it while holding only RCU, so add the RCU search. Add an RCU variant that uses list_head * as an argument, so that it can be universally used afterwards. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck CC: Cong Wang Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/core/dev.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 9a395e03da74..9290f09cdf26 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4380,6 +4380,19 @@ struct netdev_adjacent { struct rcu_head rcu; }; +static struct netdev_adjacent *__netdev_find_adj_rcu(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *adj_list) +{ + struct netdev_adjacent *adj; + + list_for_each_entry_rcu(adj, adj_list, list) { + if (adj->dev == adj_dev) + return adj; + } + return NULL; +} + static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, struct net_device *adj_dev, struct list_head *adj_list) -- cgit v1.2.3 From 402dae9614557296e84543008a8e582c28fb1db3 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 25 Sep 2013 09:20:09 +0200 Subject: net: add netdev_adjacent->private and allow to use it Currently, even though we can access any linked device, we can't attach anything to it, which is vital to properly manage them. To fix this, add a new void *private to netdev_adjacent and functions setting/getting it (per link), so that we can save, per example, bonding's slave structures there, per slave device. netdev_master_upper_dev_link_private(dev, upper_dev, private) links dev to upper dev and populates the neighbour link only with private. netdev_lower_dev_get_private{,_rcu}() returns the private, if found. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++ net/core/dev.c | 68 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 64 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 514045c704a8..75d5beac463b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2839,8 +2839,15 @@ extern int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev); extern int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev); +extern int netdev_master_upper_dev_link_private(struct net_device *dev, + struct net_device *upper_dev, + void *private); extern void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev); +extern void *netdev_lower_dev_get_private_rcu(struct net_device *dev, + struct net_device *lower_dev); +extern void *netdev_lower_dev_get_private(struct net_device *dev, + struct net_device *lower_dev); extern int skb_checksum_help(struct sk_buff *skb); extern struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path); diff --git a/net/core/dev.c b/net/core/dev.c index 9290f09cdf26..c69ab74fb201 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4376,6 +4376,9 @@ struct netdev_adjacent { /* counter for the number of times this device was added to us */ u16 ref_nr; + /* private field for the users */ + void *private; + struct list_head list; struct rcu_head rcu; }; @@ -4510,7 +4513,7 @@ EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); static int __netdev_adjacent_dev_insert(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list, - bool master) + void *private, bool master) { struct netdev_adjacent *adj; @@ -4528,6 +4531,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, adj->dev = adj_dev; adj->master = master; adj->ref_nr = 1; + adj->private = private; dev_hold(adj_dev); pr_debug("dev_hold for %s, because of link added from %s to %s\n", @@ -4574,15 +4578,17 @@ int __netdev_adjacent_dev_link_lists(struct net_device *dev, struct net_device *upper_dev, struct list_head *up_list, struct list_head *down_list, - bool master) + void *private, bool master) { int ret; - ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, master); + ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private, + master); if (ret) return ret; - ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, false); + ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private, + false); if (ret) { __netdev_adjacent_dev_remove(dev, upper_dev, up_list); return ret; @@ -4597,7 +4603,7 @@ int __netdev_adjacent_dev_link(struct net_device *dev, return __netdev_adjacent_dev_link_lists(dev, upper_dev, &dev->all_adj_list.upper, &upper_dev->all_adj_list.lower, - false); + NULL, false); } void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, @@ -4619,7 +4625,7 @@ void __netdev_adjacent_dev_unlink(struct net_device *dev, int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, struct net_device *upper_dev, - bool master) + void *private, bool master) { int ret = __netdev_adjacent_dev_link(dev, upper_dev); @@ -4629,7 +4635,7 @@ int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, &dev->adj_list.upper, &upper_dev->adj_list.lower, - master); + private, master); if (ret) { __netdev_adjacent_dev_unlink(dev, upper_dev); return ret; @@ -4648,7 +4654,8 @@ void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, } static int __netdev_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev, bool master) + struct net_device *upper_dev, bool master, + void *private) { struct netdev_adjacent *i, *j, *to_i, *to_j; int ret = 0; @@ -4668,7 +4675,8 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (master && netdev_master_upper_dev_get(dev)) return -EBUSY; - ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, master); + ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private, + master); if (ret) return ret; @@ -4759,7 +4767,7 @@ rollback_mesh: int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev) { - return __netdev_upper_dev_link(dev, upper_dev, false); + return __netdev_upper_dev_link(dev, upper_dev, false, NULL); } EXPORT_SYMBOL(netdev_upper_dev_link); @@ -4777,10 +4785,18 @@ EXPORT_SYMBOL(netdev_upper_dev_link); int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev) { - return __netdev_upper_dev_link(dev, upper_dev, true); + return __netdev_upper_dev_link(dev, upper_dev, true, NULL); } EXPORT_SYMBOL(netdev_master_upper_dev_link); +int netdev_master_upper_dev_link_private(struct net_device *dev, + struct net_device *upper_dev, + void *private) +{ + return __netdev_upper_dev_link(dev, upper_dev, true, private); +} +EXPORT_SYMBOL(netdev_master_upper_dev_link_private); + /** * netdev_upper_dev_unlink - Removes a link to upper device * @dev: device @@ -4818,6 +4834,36 @@ void netdev_upper_dev_unlink(struct net_device *dev, } EXPORT_SYMBOL(netdev_upper_dev_unlink); +void *netdev_lower_dev_get_private_rcu(struct net_device *dev, + struct net_device *lower_dev) +{ + struct netdev_adjacent *lower; + + if (!lower_dev) + return NULL; + lower = __netdev_find_adj_rcu(dev, lower_dev, &dev->adj_list.lower); + if (!lower) + return NULL; + + return lower->private; +} +EXPORT_SYMBOL(netdev_lower_dev_get_private_rcu); + +void *netdev_lower_dev_get_private(struct net_device *dev, + struct net_device *lower_dev) +{ + struct netdev_adjacent *lower; + + if (!lower_dev) + return NULL; + lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower); + if (!lower) + return NULL; + + return lower->private; +} +EXPORT_SYMBOL(netdev_lower_dev_get_private); + static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; -- cgit v1.2.3 From 31088a113c2a948856ed2047d8c21c217b13e85d Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 25 Sep 2013 09:20:12 +0200 Subject: net: add for_each iterators through neighbour lower link's private Add a possibility to iterate through netdev_adjacent's private, currently only for lower neighbours. Add both RCU and RTNL/other locking variants of iterators, and make the non-rcu variant to be safe from removal. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- include/linux/netdevice.h | 17 ++++++++++++++ net/core/dev.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 76 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 75d5beac463b..168974e40cf5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2833,6 +2833,23 @@ extern struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *d updev; \ updev = netdev_all_upper_get_next_dev_rcu(dev, &(iter))) +extern void *netdev_lower_get_next_private(struct net_device *dev, + struct list_head **iter); +extern void *netdev_lower_get_next_private_rcu(struct net_device *dev, + struct list_head **iter); + +#define netdev_for_each_lower_private(dev, priv, iter) \ + for (iter = (dev)->adj_list.lower.next, \ + priv = netdev_lower_get_next_private(dev, &(iter)); \ + priv; \ + priv = netdev_lower_get_next_private(dev, &(iter))) + +#define netdev_for_each_lower_private_rcu(dev, priv, iter) \ + for (iter = &(dev)->adj_list.lower, \ + priv = netdev_lower_get_next_private_rcu(dev, &(iter)); \ + priv; \ + priv = netdev_lower_get_next_private_rcu(dev, &(iter))) + extern struct net_device *netdev_master_upper_dev_get(struct net_device *dev); extern struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev); extern int netdev_upper_dev_link(struct net_device *dev, diff --git a/net/core/dev.c b/net/core/dev.c index c69ab74fb201..0aa844aae40b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4466,7 +4466,8 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) } EXPORT_SYMBOL(netdev_master_upper_dev_get); -/* netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list +/** + * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list * @dev: device * @iter: list_head ** of the current position * @@ -4491,6 +4492,63 @@ struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, } EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu); +/** + * netdev_lower_get_next_private - Get the next ->private from the + * lower neighbour list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent->private from the dev's lower neighbour + * list, starting from iter position. The caller must hold either hold the + * RTNL lock or its own locking that guarantees that the neighbour lower + * list will remain unchainged. + */ +void *netdev_lower_get_next_private(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_entry(*iter, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + if (iter) + *iter = lower->list.next; + + return lower->private; +} +EXPORT_SYMBOL(netdev_lower_get_next_private); + +/** + * netdev_lower_get_next_private_rcu - Get the next ->private from the + * lower neighbour list, RCU + * variant + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent->private from the dev's lower neighbour + * list, starting from iter position. The caller must hold RCU read lock. + */ +void *netdev_lower_get_next_private_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *lower; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + if (iter) + *iter = &lower->list; + + return lower->private; +} +EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); + /** * netdev_master_upper_dev_get_rcu - Get master upper device * @dev: device -- cgit v1.2.3 From b6ccba4c681fdaf0070e580bf951badf7edc860b Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 25 Sep 2013 09:20:23 +0200 Subject: net: add a possibility to get private from netdev_adjacent->list It will be useful to get first/last element. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + net/core/dev.c | 10 ++++++++++ 2 files changed, 11 insertions(+) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 168974e40cf5..b4cfb63f264e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2850,6 +2850,7 @@ extern void *netdev_lower_get_next_private_rcu(struct net_device *dev, priv; \ priv = netdev_lower_get_next_private_rcu(dev, &(iter))) +extern void *netdev_adjacent_get_private(struct list_head *adj_list); extern struct net_device *netdev_master_upper_dev_get(struct net_device *dev); extern struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev); extern int netdev_upper_dev_link(struct net_device *dev, diff --git a/net/core/dev.c b/net/core/dev.c index 0aa844aae40b..acc11810805f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4466,6 +4466,16 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) } EXPORT_SYMBOL(netdev_master_upper_dev_get); +void *netdev_adjacent_get_private(struct list_head *adj_list) +{ + struct netdev_adjacent *adj; + + adj = list_entry(adj_list, struct netdev_adjacent, list); + + return adj->private; +} +EXPORT_SYMBOL(netdev_adjacent_get_private); + /** * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list * @dev: device -- cgit v1.2.3 From 842d67a7b34ea735155812ecf0671a481284f358 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 25 Sep 2013 09:20:31 +0200 Subject: net: expose the master link to sysfs, and remove it from bond Currently, we can have only one master upper neighbour, so it would be useful to create a symlink to it in the sysfs device directory, the way that bonding now does it, for every device. Lower devices from bridge/team/etc will automagically get it, so we could rely on it. Also, remove the same functionality from bonding. CC: Jay Vosburgh CC: Andy Gospodarek CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- drivers/net/bonding/bond_sysfs.c | 20 +++----------------- net/core/dev.c | 19 +++++++++++++++++-- 2 files changed, 20 insertions(+), 19 deletions(-) (limited to 'net/core') diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c index 04d95d6f6c63..1c5724672204 100644 --- a/drivers/net/bonding/bond_sysfs.c +++ b/drivers/net/bonding/bond_sysfs.c @@ -172,24 +172,11 @@ int bond_create_slave_symlinks(struct net_device *master, struct net_device *slave) { char linkname[IFNAMSIZ+7]; - int ret = 0; - /* first, create a link from the slave back to the master */ - ret = sysfs_create_link(&(slave->dev.kobj), &(master->dev.kobj), - "master"); - if (ret) - return ret; - /* next, create a link from the master to the slave */ + /* create a link from the master to the slave */ sprintf(linkname, "slave_%s", slave->name); - ret = sysfs_create_link(&(master->dev.kobj), &(slave->dev.kobj), - linkname); - - /* free the master link created earlier in case of error */ - if (ret) - sysfs_remove_link(&(slave->dev.kobj), "master"); - - return ret; - + return sysfs_create_link(&(master->dev.kobj), &(slave->dev.kobj), + linkname); } void bond_destroy_slave_symlinks(struct net_device *master, @@ -197,7 +184,6 @@ void bond_destroy_slave_symlinks(struct net_device *master, { char linkname[IFNAMSIZ+7]; - sysfs_remove_link(&(slave->dev.kobj), "master"); sprintf(linkname, "slave_%s", slave->name); sysfs_remove_link(&(master->dev.kobj), linkname); } diff --git a/net/core/dev.c b/net/core/dev.c index acc11810805f..de443ee1b046 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4584,6 +4584,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, void *private, bool master) { struct netdev_adjacent *adj; + int ret; adj = __netdev_find_adj(dev, adj_dev, dev_list); @@ -4606,12 +4607,23 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, adj_dev->name, dev->name, adj_dev->name); /* Ensure that master link is always the first item in list. */ - if (master) + if (master) { + ret = sysfs_create_link(&(dev->dev.kobj), + &(adj_dev->dev.kobj), "master"); + if (ret) + goto free_adj; + list_add_rcu(&adj->list, dev_list); - else + } else { list_add_tail_rcu(&adj->list, dev_list); + } return 0; + +free_adj: + kfree(adj); + + return ret; } void __netdev_adjacent_dev_remove(struct net_device *dev, @@ -4635,6 +4647,9 @@ void __netdev_adjacent_dev_remove(struct net_device *dev, return; } + if (adj->master) + sysfs_remove_link(&(dev->dev.kobj), "master"); + list_del_rcu(&adj->list); pr_debug("dev_put for %s, because link removed from %s to %s\n", adj_dev->name, dev->name, adj_dev->name); -- cgit v1.2.3 From 5831d66e8097aedfa3bc35941cf265ada2352317 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 25 Sep 2013 09:20:32 +0200 Subject: net: create sysfs symlinks for neighbour devices Also, remove the same functionality from bonding - it will be already done for any device that links to its lower/upper neighbour. The links will be created for dev's kobject, and will look like lower_eth0 for lower device eth0 and upper_bridge0 for upper device bridge0. CC: Jay Vosburgh CC: Andy Gospodarek CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 11 +---------- drivers/net/bonding/bond_sysfs.c | 21 --------------------- drivers/net/bonding/bonding.h | 2 -- net/core/dev.c | 35 ++++++++++++++++++++++++++++++++++- 4 files changed, 35 insertions(+), 34 deletions(-) (limited to 'net/core') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index d49404509814..d5c3153226b7 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1612,15 +1612,11 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) read_unlock(&bond->lock); - res = bond_create_slave_symlinks(bond_dev, slave_dev); - if (res) - goto err_detach; - res = netdev_rx_handler_register(slave_dev, bond_handle_frame, new_slave); if (res) { pr_debug("Error %d calling netdev_rx_handler_register\n", res); - goto err_dest_symlinks; + goto err_detach; } res = bond_master_upper_dev_link(bond_dev, slave_dev, new_slave); @@ -1642,9 +1638,6 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) err_unregister: netdev_rx_handler_unregister(slave_dev); -err_dest_symlinks: - bond_destroy_slave_symlinks(bond_dev, slave_dev); - err_detach: if (!USES_PRIMARY(bond->params.mode)) bond_hw_addr_flush(bond_dev, slave_dev); @@ -1842,8 +1835,6 @@ static int __bond_release_one(struct net_device *bond_dev, bond_dev->name, slave_dev->name, bond_dev->name); /* must do this from outside any spinlocks */ - bond_destroy_slave_symlinks(bond_dev, slave_dev); - vlan_vids_del_by_dev(slave_dev, bond_dev); /* If the mode USES_PRIMARY, then this cases was handled above by diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c index 1c5724672204..e06c644470b1 100644 --- a/drivers/net/bonding/bond_sysfs.c +++ b/drivers/net/bonding/bond_sysfs.c @@ -168,27 +168,6 @@ static const struct class_attribute class_attr_bonding_masters = { .namespace = bonding_namespace, }; -int bond_create_slave_symlinks(struct net_device *master, - struct net_device *slave) -{ - char linkname[IFNAMSIZ+7]; - - /* create a link from the master to the slave */ - sprintf(linkname, "slave_%s", slave->name); - return sysfs_create_link(&(master->dev.kobj), &(slave->dev.kobj), - linkname); -} - -void bond_destroy_slave_symlinks(struct net_device *master, - struct net_device *slave) -{ - char linkname[IFNAMSIZ+7]; - - sprintf(linkname, "slave_%s", slave->name); - sysfs_remove_link(&(master->dev.kobj), linkname); -} - - /* * Show the slaves in the current bond. */ diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h index e5c32a61dacc..5b71601666cd 100644 --- a/drivers/net/bonding/bonding.h +++ b/drivers/net/bonding/bonding.h @@ -436,8 +436,6 @@ int bond_create(struct net *net, const char *name); int bond_create_sysfs(struct bond_net *net); void bond_destroy_sysfs(struct bond_net *net); void bond_prepare_sysfs_group(struct bonding *bond); -int bond_create_slave_symlinks(struct net_device *master, struct net_device *slave); -void bond_destroy_slave_symlinks(struct net_device *master, struct net_device *slave); int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev); int bond_release(struct net_device *bond_dev, struct net_device *slave_dev); void bond_mii_monitor(struct work_struct *); diff --git a/net/core/dev.c b/net/core/dev.c index de443ee1b046..25ab6fe80da2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4584,6 +4584,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, void *private, bool master) { struct netdev_adjacent *adj; + char linkname[IFNAMSIZ+7]; int ret; adj = __netdev_find_adj(dev, adj_dev, dev_list); @@ -4606,12 +4607,26 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, pr_debug("dev_hold for %s, because of link added from %s to %s\n", adj_dev->name, dev->name, adj_dev->name); + if (dev_list == &dev->adj_list.lower) { + sprintf(linkname, "lower_%s", adj_dev->name); + ret = sysfs_create_link(&(dev->dev.kobj), + &(adj_dev->dev.kobj), linkname); + if (ret) + goto free_adj; + } else if (dev_list == &dev->adj_list.upper) { + sprintf(linkname, "upper_%s", adj_dev->name); + ret = sysfs_create_link(&(dev->dev.kobj), + &(adj_dev->dev.kobj), linkname); + if (ret) + goto free_adj; + } + /* Ensure that master link is always the first item in list. */ if (master) { ret = sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), "master"); if (ret) - goto free_adj; + goto remove_symlinks; list_add_rcu(&adj->list, dev_list); } else { @@ -4620,6 +4635,15 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, return 0; +remove_symlinks: + if (dev_list == &dev->adj_list.lower) { + sprintf(linkname, "lower_%s", adj_dev->name); + sysfs_remove_link(&(dev->dev.kobj), linkname); + } else if (dev_list == &dev->adj_list.upper) { + sprintf(linkname, "upper_%s", adj_dev->name); + sysfs_remove_link(&(dev->dev.kobj), linkname); + } + free_adj: kfree(adj); @@ -4631,6 +4655,7 @@ void __netdev_adjacent_dev_remove(struct net_device *dev, struct list_head *dev_list) { struct netdev_adjacent *adj; + char linkname[IFNAMSIZ+7]; adj = __netdev_find_adj(dev, adj_dev, dev_list); @@ -4650,6 +4675,14 @@ void __netdev_adjacent_dev_remove(struct net_device *dev, if (adj->master) sysfs_remove_link(&(dev->dev.kobj), "master"); + if (dev_list == &dev->adj_list.lower) { + sprintf(linkname, "lower_%s", adj_dev->name); + sysfs_remove_link(&(dev->dev.kobj), linkname); + } else if (dev_list == &dev->adj_list.upper) { + sprintf(linkname, "upper_%s", adj_dev->name); + sysfs_remove_link(&(dev->dev.kobj), linkname); + } + list_del_rcu(&adj->list); pr_debug("dev_put for %s, because link removed from %s to %s\n", adj_dev->name, dev->name, adj_dev->name); -- cgit v1.2.3 From 62748f32d501f5d3712a7c372bbb92abc7c62bc7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Sep 2013 08:20:52 -0700 Subject: net: introduce SO_MAX_PACING_RATE As mentioned in commit afe4fd062416b ("pkt_sched: fq: Fair Queue packet scheduler"), this patch adds a new socket option. SO_MAX_PACING_RATE offers the application the ability to cap the rate computed by transport layer. Value is in bytes per second. u32 val = 1000000; setsockopt(sockfd, SOL_SOCKET, SO_MAX_PACING_RATE, &val, sizeof(val)); To be effectively paced, a flow must use FQ packet scheduler. Note that a packet scheduler takes into account the headers for its computations. The effective payload rate depends on MSS and retransmits if any. I chose to make this pacing rate a SOL_SOCKET option instead of a TCP one because this can be used by other protocols. Signed-off-by: Eric Dumazet Cc: Steinar H. Gunderson Cc: Michael Kerrisk Signed-off-by: David S. Miller --- arch/alpha/include/uapi/asm/socket.h | 4 +++- arch/avr32/include/uapi/asm/socket.h | 2 ++ arch/cris/include/uapi/asm/socket.h | 2 ++ arch/frv/include/uapi/asm/socket.h | 2 ++ arch/h8300/include/uapi/asm/socket.h | 2 ++ arch/ia64/include/uapi/asm/socket.h | 2 ++ arch/m32r/include/uapi/asm/socket.h | 2 ++ arch/mips/include/uapi/asm/socket.h | 2 ++ arch/mn10300/include/uapi/asm/socket.h | 2 ++ arch/parisc/include/uapi/asm/socket.h | 2 ++ arch/powerpc/include/uapi/asm/socket.h | 2 ++ arch/s390/include/uapi/asm/socket.h | 2 ++ arch/sparc/include/uapi/asm/socket.h | 2 ++ arch/xtensa/include/uapi/asm/socket.h | 2 ++ include/net/sock.h | 1 + include/uapi/asm-generic/socket.h | 2 ++ net/core/sock.c | 12 ++++++++++++ net/ipv4/tcp_input.c | 2 +- 18 files changed, 45 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 467de010ea7e..e3a1491d5073 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -81,6 +81,8 @@ #define SO_SELECT_ERR_QUEUE 45 -#define SO_BUSY_POLL 46 +#define SO_BUSY_POLL 46 + +#define SO_MAX_PACING_RATE 47 #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h index 11c4259c62fb..439936421434 100644 --- a/arch/avr32/include/uapi/asm/socket.h +++ b/arch/avr32/include/uapi/asm/socket.h @@ -76,4 +76,6 @@ #define SO_BUSY_POLL 46 +#define SO_MAX_PACING_RATE 47 + #endif /* __ASM_AVR32_SOCKET_H */ diff --git a/arch/cris/include/uapi/asm/socket.h b/arch/cris/include/uapi/asm/socket.h index eb723e51554e..13829aaaeec5 100644 --- a/arch/cris/include/uapi/asm/socket.h +++ b/arch/cris/include/uapi/asm/socket.h @@ -78,6 +78,8 @@ #define SO_BUSY_POLL 46 +#define SO_MAX_PACING_RATE 47 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h index f0cb1c341163..5d4299762426 100644 --- a/arch/frv/include/uapi/asm/socket.h +++ b/arch/frv/include/uapi/asm/socket.h @@ -76,5 +76,7 @@ #define SO_BUSY_POLL 46 +#define SO_MAX_PACING_RATE 47 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/h8300/include/uapi/asm/socket.h b/arch/h8300/include/uapi/asm/socket.h index 9490758c5e2b..214ccaf3554a 100644 --- a/arch/h8300/include/uapi/asm/socket.h +++ b/arch/h8300/include/uapi/asm/socket.h @@ -76,4 +76,6 @@ #define SO_BUSY_POLL 46 +#define SO_MAX_PACING_RATE 47 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h index 556d0701a155..c25302fb48d9 100644 --- a/arch/ia64/include/uapi/asm/socket.h +++ b/arch/ia64/include/uapi/asm/socket.h @@ -85,4 +85,6 @@ #define SO_BUSY_POLL 46 +#define SO_MAX_PACING_RATE 47 + #endif /* _ASM_IA64_SOCKET_H */ diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h index 24be7c8da86a..52966650114f 100644 --- a/arch/m32r/include/uapi/asm/socket.h +++ b/arch/m32r/include/uapi/asm/socket.h @@ -76,4 +76,6 @@ #define SO_BUSY_POLL 46 +#define SO_MAX_PACING_RATE 47 + #endif /* _ASM_M32R_SOCKET_H */ diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 61c01f054d1b..0df9787cd84d 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -94,4 +94,6 @@ #define SO_BUSY_POLL 46 +#define SO_MAX_PACING_RATE 47 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h index e2a2b203eb00..71dedcae55a6 100644 --- a/arch/mn10300/include/uapi/asm/socket.h +++ b/arch/mn10300/include/uapi/asm/socket.h @@ -76,4 +76,6 @@ #define SO_BUSY_POLL 46 +#define SO_MAX_PACING_RATE 47 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 71700e636a8e..7c614d01f1fa 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -75,6 +75,8 @@ #define SO_BUSY_POLL 0x4027 +#define SO_MAX_PACING_RATE 0x4048 + /* O_NONBLOCK clashes with the bits used for socket types. Therefore we * have to define SOCK_NONBLOCK to a different value here. */ diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h index a6d74467c9ed..fa698324a1fd 100644 --- a/arch/powerpc/include/uapi/asm/socket.h +++ b/arch/powerpc/include/uapi/asm/socket.h @@ -83,4 +83,6 @@ #define SO_BUSY_POLL 46 +#define SO_MAX_PACING_RATE 47 + #endif /* _ASM_POWERPC_SOCKET_H */ diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h index 92494494692e..c286c2e868f0 100644 --- a/arch/s390/include/uapi/asm/socket.h +++ b/arch/s390/include/uapi/asm/socket.h @@ -82,4 +82,6 @@ #define SO_BUSY_POLL 46 +#define SO_MAX_PACING_RATE 47 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 4e1d66c3ce71..0f21e9a5ca18 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -72,6 +72,8 @@ #define SO_BUSY_POLL 0x0030 +#define SO_MAX_PACING_RATE 0x0031 + /* Security levels - as per NRL IPv6 - don't actually do anything */ #define SO_SECURITY_AUTHENTICATION 0x5001 #define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002 diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h index c114483010c1..7db5c22faa68 100644 --- a/arch/xtensa/include/uapi/asm/socket.h +++ b/arch/xtensa/include/uapi/asm/socket.h @@ -87,4 +87,6 @@ #define SO_BUSY_POLL 46 +#define SO_MAX_PACING_RATE 47 + #endif /* _XTENSA_SOCKET_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 4625d2eff461..240aa3f08cd6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -363,6 +363,7 @@ struct sock { int sk_wmem_queued; gfp_t sk_allocation; u32 sk_pacing_rate; /* bytes per second */ + u32 sk_max_pacing_rate; netdev_features_t sk_route_caps; netdev_features_t sk_route_nocaps; int sk_gso_type; diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index f04b69b6abf2..38f14d0264c3 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -78,4 +78,6 @@ #define SO_BUSY_POLL 46 +#define SO_MAX_PACING_RATE 47 + #endif /* __ASM_GENERIC_SOCKET_H */ diff --git a/net/core/sock.c b/net/core/sock.c index 5b6beba494a3..2bd9b3faa0d0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -914,6 +914,13 @@ set_rcvbuf: } break; #endif + + case SO_MAX_PACING_RATE: + sk->sk_max_pacing_rate = val; + sk->sk_pacing_rate = min(sk->sk_pacing_rate, + sk->sk_max_pacing_rate); + break; + default: ret = -ENOPROTOOPT; break; @@ -1177,6 +1184,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; #endif + case SO_MAX_PACING_RATE: + v.val = sk->sk_max_pacing_rate; + break; + default: return -ENOPROTOOPT; } @@ -2319,6 +2330,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_ll_usec = sysctl_net_busy_read; #endif + sk->sk_max_pacing_rate = ~0U; /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5d083855c111..66aa816ad30b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -735,7 +735,7 @@ static void tcp_update_pacing_rate(struct sock *sk) if (tp->srtt > 8 + 2) do_div(rate, tp->srtt); - sk->sk_pacing_rate = min_t(u64, rate, ~0U); + sk->sk_pacing_rate = min_t(u64, rate, sk->sk_max_pacing_rate); } /* Calculate rto without backoff. This is the second half of Van Jacobson's -- cgit v1.2.3 From a528c219df2e865e178c538c7178961dfed5a13c Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 25 Sep 2013 12:02:44 +0200 Subject: dev: update __dev_notify_flags() to send rtnl msg This patch only prepares the next one, there is no functional change. Now, __dev_notify_flags() can also be used to notify flags changes via rtnetlink. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +++- net/core/dev.c | 11 ++++++----- net/core/rtnetlink.c | 3 +-- 3 files changed, 10 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b4cfb63f264e..f44f99a69977 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2378,7 +2378,9 @@ extern int dev_ethtool(struct net *net, struct ifreq *); extern unsigned int dev_get_flags(const struct net_device *); extern int __dev_change_flags(struct net_device *, unsigned int flags); extern int dev_change_flags(struct net_device *, unsigned int); -extern void __dev_notify_flags(struct net_device *, unsigned int old_flags); +void __dev_notify_flags(struct net_device *, + unsigned int old_flags, + unsigned int gchanges); extern int dev_change_name(struct net_device *, const char *); extern int dev_set_alias(struct net_device *, const char *, size_t); extern int dev_change_net_namespace(struct net_device *, diff --git a/net/core/dev.c b/net/core/dev.c index 25ab6fe80da2..594a6b0ab3da 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5235,10 +5235,14 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) return ret; } -void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) +void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, + unsigned int gchanges) { unsigned int changes = dev->flags ^ old_flags; + if (gchanges) + rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges); + if (changes & IFF_UP) { if (dev->flags & IFF_UP) call_netdevice_notifiers(NETDEV_UP, dev); @@ -5274,10 +5278,7 @@ int dev_change_flags(struct net_device *dev, unsigned int flags) return ret; changes = old_flags ^ dev->flags; - if (changes) - rtmsg_ifinfo(RTM_NEWLINK, dev, changes); - - __dev_notify_flags(dev, old_flags); + __dev_notify_flags(dev, old_flags, changes); return ret; } EXPORT_SYMBOL(dev_change_flags); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2a0e21de3060..4aedf03da052 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1647,9 +1647,8 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) } dev->rtnl_link_state = RTNL_LINK_INITIALIZED; - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); - __dev_notify_flags(dev, old_flags); + __dev_notify_flags(dev, old_flags, ~0U); return 0; } EXPORT_SYMBOL(rtnl_configure_link); -- cgit v1.2.3 From 991fb3f74c142e1a1891ff4f7e9a6285a79a8ea1 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 25 Sep 2013 12:02:45 +0200 Subject: dev: always advertise rx_flags changes via netlink When flags IFF_PROMISC and IFF_ALLMULTI are changed, netlink messages are not consistent. For example, if a multicast daemon is running (flag IFF_ALLMULTI set in dev->flags but not dev->gflags, ie not exported to userspace) and then a user sets it via netlink (flag IFF_ALLMULTI set in dev->flags and dev->gflags, ie exported to userspace), no netlink message is sent. Same for IFF_PROMISC and because dev->promiscuity is exported via IFLA_PROMISCUITY, we may send a netlink message after each change of this counter. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/dev.c | 60 ++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 23 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 594a6b0ab3da..81340ed7f0f4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4988,7 +4988,7 @@ static void dev_change_rx_flags(struct net_device *dev, int flags) ops->ndo_change_rx_flags(dev, flags); } -static int __dev_set_promiscuity(struct net_device *dev, int inc) +static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags; kuid_t uid; @@ -5031,6 +5031,8 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc) dev_change_rx_flags(dev, IFF_PROMISC); } + if (notify) + __dev_notify_flags(dev, old_flags, IFF_PROMISC); return 0; } @@ -5050,7 +5052,7 @@ int dev_set_promiscuity(struct net_device *dev, int inc) unsigned int old_flags = dev->flags; int err; - err = __dev_set_promiscuity(dev, inc); + err = __dev_set_promiscuity(dev, inc, true); if (err < 0) return err; if (dev->flags != old_flags) @@ -5059,22 +5061,9 @@ int dev_set_promiscuity(struct net_device *dev, int inc) } EXPORT_SYMBOL(dev_set_promiscuity); -/** - * dev_set_allmulti - update allmulti count on a device - * @dev: device - * @inc: modifier - * - * Add or remove reception of all multicast frames to a device. While the - * count in the device remains above zero the interface remains listening - * to all interfaces. Once it hits zero the device reverts back to normal - * filtering operation. A negative @inc value is used to drop the counter - * when releasing a resource needing all multicasts. - * Return 0 if successful or a negative errno code on error. - */ - -int dev_set_allmulti(struct net_device *dev, int inc) +static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) { - unsigned int old_flags = dev->flags; + unsigned int old_flags = dev->flags, old_gflags = dev->gflags; ASSERT_RTNL(); @@ -5097,9 +5086,30 @@ int dev_set_allmulti(struct net_device *dev, int inc) if (dev->flags ^ old_flags) { dev_change_rx_flags(dev, IFF_ALLMULTI); dev_set_rx_mode(dev); + if (notify) + __dev_notify_flags(dev, old_flags, + dev->gflags ^ old_gflags); } return 0; } + +/** + * dev_set_allmulti - update allmulti count on a device + * @dev: device + * @inc: modifier + * + * Add or remove reception of all multicast frames to a device. While the + * count in the device remains above zero the interface remains listening + * to all interfaces. Once it hits zero the device reverts back to normal + * filtering operation. A negative @inc value is used to drop the counter + * when releasing a resource needing all multicasts. + * Return 0 if successful or a negative errno code on error. + */ + +int dev_set_allmulti(struct net_device *dev, int inc) +{ + return __dev_set_allmulti(dev, inc, true); +} EXPORT_SYMBOL(dev_set_allmulti); /* @@ -5124,10 +5134,10 @@ void __dev_set_rx_mode(struct net_device *dev) * therefore calling __dev_set_promiscuity here is safe. */ if (!netdev_uc_empty(dev) && !dev->uc_promisc) { - __dev_set_promiscuity(dev, 1); + __dev_set_promiscuity(dev, 1, false); dev->uc_promisc = true; } else if (netdev_uc_empty(dev) && dev->uc_promisc) { - __dev_set_promiscuity(dev, -1); + __dev_set_promiscuity(dev, -1, false); dev->uc_promisc = false; } } @@ -5216,9 +5226,13 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; + unsigned int old_flags = dev->flags; dev->gflags ^= IFF_PROMISC; - dev_set_promiscuity(dev, inc); + + if (__dev_set_promiscuity(dev, inc, false) >= 0) + if (dev->flags != old_flags) + dev_set_rx_mode(dev); } /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI @@ -5229,7 +5243,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) int inc = (flags & IFF_ALLMULTI) ? 1 : -1; dev->gflags ^= IFF_ALLMULTI; - dev_set_allmulti(dev, inc); + __dev_set_allmulti(dev, inc, false); } return ret; @@ -5271,13 +5285,13 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, int dev_change_flags(struct net_device *dev, unsigned int flags) { int ret; - unsigned int changes, old_flags = dev->flags; + unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; ret = __dev_change_flags(dev, flags); if (ret < 0) return ret; - changes = old_flags ^ dev->flags; + changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); __dev_notify_flags(dev, old_flags, changes); return ret; } -- cgit v1.2.3 From 357afe9c46c951c34769e39cabdf8d1637e2eecc Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 2 Oct 2013 13:39:24 +0200 Subject: flow_dissector: factor out the ports extraction in skb_flow_get_ports Factor out the code that extracts the ports from skb_flow_dissect and add a new function skb_flow_get_ports which can be re-used. Suggested-by: Veaceslav Falico Signed-off-by: Nikolay Aleksandrov Acked-by: Eric Dumazet Reviewed-by: Veaceslav Falico Signed-off-by: David S. Miller --- include/net/flow_keys.h | 1 + net/core/flow_dissector.c | 39 ++++++++++++++++++++++++++++----------- 2 files changed, 29 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h index ac2439d02f54..7e64bd8bbda9 100644 --- a/include/net/flow_keys.h +++ b/include/net/flow_keys.h @@ -14,4 +14,5 @@ struct flow_keys { }; bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow); +__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto); #endif diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 8d7d0dd72db2..f8e25ac41c6c 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -25,9 +25,35 @@ static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *i memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst)); } +/** + * skb_flow_get_ports - extract the upper layer ports and return them + * @skb: buffer to extract the ports from + * @thoff: transport header offset + * @ip_proto: protocol for which to get port offset + * + * The function will try to retrieve the ports at offset thoff + poff where poff + * is the protocol port offset returned from proto_ports_offset + */ +__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto) +{ + int poff = proto_ports_offset(ip_proto); + + if (poff >= 0) { + __be32 *ports, _ports; + + ports = skb_header_pointer(skb, thoff + poff, + sizeof(_ports), &_ports); + if (ports) + return *ports; + } + + return 0; +} +EXPORT_SYMBOL(skb_flow_get_ports); + bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow) { - int poff, nhoff = skb_network_offset(skb); + int nhoff = skb_network_offset(skb); u8 ip_proto; __be16 proto = skb->protocol; @@ -150,16 +176,7 @@ ipv6: } flow->ip_proto = ip_proto; - poff = proto_ports_offset(ip_proto); - if (poff >= 0) { - __be32 *ports, _ports; - - ports = skb_header_pointer(skb, nhoff + poff, - sizeof(_ports), &_ports); - if (ports) - flow->ports = *ports; - } - + flow->ports = skb_flow_get_ports(skb, nhoff, ip_proto); flow->thoff = (u16) nhoff; return true; -- cgit v1.2.3 From 5cde282938915f36a2e6769b51c24c4159654859 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 5 Oct 2013 19:26:05 -0700 Subject: net: Separate the close_list and the unreg_list v2 Separate the unreg_list and the close_list in dev_close_many preventing dev_close_many from permuting the unreg_list. The permutations of the unreg_list have resulted in cases where the loopback device is accessed it has been freed in code such as dst_ifdown. Resulting in subtle memory corruption. This is the second bug from sharing the storage between the close_list and the unreg_list. The issues that crop up with sharing are apparently too subtle to show up in normal testing or usage, so let's forget about being clever and use two separate lists. v2: Make all callers pass in a close_list to dev_close_many Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + net/core/dev.c | 25 ++++++++++++++----------- net/sched/sch_generic.c | 6 +++--- 3 files changed, 18 insertions(+), 14 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f5cd464271bf..6d77e0f3cc10 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1143,6 +1143,7 @@ struct net_device { struct list_head dev_list; struct list_head napi_list; struct list_head unreg_list; + struct list_head close_list; /* directly linked devices, like slaves for bonding */ struct { diff --git a/net/core/dev.c b/net/core/dev.c index c25db20a4246..fa0b2b06c1a6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1307,7 +1307,7 @@ static int __dev_close_many(struct list_head *head) ASSERT_RTNL(); might_sleep(); - list_for_each_entry(dev, head, unreg_list) { + list_for_each_entry(dev, head, close_list) { call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); clear_bit(__LINK_STATE_START, &dev->state); @@ -1323,7 +1323,7 @@ static int __dev_close_many(struct list_head *head) dev_deactivate_many(head); - list_for_each_entry(dev, head, unreg_list) { + list_for_each_entry(dev, head, close_list) { const struct net_device_ops *ops = dev->netdev_ops; /* @@ -1351,7 +1351,7 @@ static int __dev_close(struct net_device *dev) /* Temporarily disable netpoll until the interface is down */ netpoll_rx_disable(dev); - list_add(&dev->unreg_list, &single); + list_add(&dev->close_list, &single); retval = __dev_close_many(&single); list_del(&single); @@ -1362,21 +1362,20 @@ static int __dev_close(struct net_device *dev) static int dev_close_many(struct list_head *head) { struct net_device *dev, *tmp; - LIST_HEAD(tmp_list); - list_for_each_entry_safe(dev, tmp, head, unreg_list) + /* Remove the devices that don't need to be closed */ + list_for_each_entry_safe(dev, tmp, head, close_list) if (!(dev->flags & IFF_UP)) - list_move(&dev->unreg_list, &tmp_list); + list_del_init(&dev->close_list); __dev_close_many(head); - list_for_each_entry(dev, head, unreg_list) { + list_for_each_entry_safe(dev, tmp, head, close_list) { rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); call_netdevice_notifiers(NETDEV_DOWN, dev); + list_del_init(&dev->close_list); } - /* rollback_registered_many needs the complete original list */ - list_splice(&tmp_list, head); return 0; } @@ -1397,7 +1396,7 @@ int dev_close(struct net_device *dev) /* Block netpoll rx while the interface is going down */ netpoll_rx_disable(dev); - list_add(&dev->unreg_list, &single); + list_add(&dev->close_list, &single); dev_close_many(&single); list_del(&single); @@ -5439,6 +5438,7 @@ static void net_set_todo(struct net_device *dev) static void rollback_registered_many(struct list_head *head) { struct net_device *dev, *tmp; + LIST_HEAD(close_head); BUG_ON(dev_boot_phase); ASSERT_RTNL(); @@ -5461,7 +5461,9 @@ static void rollback_registered_many(struct list_head *head) } /* If device is running, close it first. */ - dev_close_many(head); + list_for_each_entry(dev, head, unreg_list) + list_add_tail(&dev->close_list, &close_head); + dev_close_many(&close_head); list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. */ @@ -6257,6 +6259,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); + INIT_LIST_HEAD(&dev->close_list); INIT_LIST_HEAD(&dev->link_watch_list); INIT_LIST_HEAD(&dev->adj_list.upper); INIT_LIST_HEAD(&dev->adj_list.lower); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index e7121d29c4bd..7fc899a943a8 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -829,7 +829,7 @@ void dev_deactivate_many(struct list_head *head) struct net_device *dev; bool sync_needed = false; - list_for_each_entry(dev, head, unreg_list) { + list_for_each_entry(dev, head, close_list) { netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc); if (dev_ingress_queue(dev)) @@ -848,7 +848,7 @@ void dev_deactivate_many(struct list_head *head) synchronize_net(); /* Wait for outstanding qdisc_run calls. */ - list_for_each_entry(dev, head, unreg_list) + list_for_each_entry(dev, head, close_list) while (some_qdisc_is_busy(dev)) yield(); } @@ -857,7 +857,7 @@ void dev_deactivate(struct net_device *dev) { LIST_HEAD(single); - list_add(&dev->unreg_list, &single); + list_add(&dev->close_list, &single); dev_deactivate_many(&single); list_del(&single); } -- cgit v1.2.3 From e1af5e445ef8582e8f690fadcd63797db1e62663 Mon Sep 17 00:00:00 2001 From: Gao feng Date: Tue, 8 Oct 2013 11:05:19 +0800 Subject: cgroup: netprio: remove unnecessary task_netprioidx Since the tasks have been migrated to the cgroup, there is no need to call task_netprioidx to get task's cgroup id. Signed-off-by: Gao feng Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/core/netprio_cgroup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index d9cd627e6a16..9b7cf6c85f82 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -222,11 +222,10 @@ static void net_prio_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { struct task_struct *p; - void *v; + void *v = (void *)(unsigned long)css->cgroup->id; cgroup_taskset_for_each(p, css, tset) { task_lock(p); - v = (void *)(unsigned long)task_netprioidx(p); iterate_fd(p->files, 0, update_netprio, v); task_unlock(p); } -- cgit v1.2.3 From 8a29111c7ca68d928dfab58636f3f6acf0ac04f7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Oct 2013 09:02:23 -0700 Subject: net: gro: allow to build full sized skb skb_gro_receive() is currently limited to 16 or 17 MSS per GRO skb, typically 24616 bytes, because it fills up to MAX_SKB_FRAGS frags. It's relatively easy to extend the skb using frag_list to allow more frags to be appended into the last sk_buff. This still builds very efficient skbs, and allows reaching 45 MSS per skb. (45 MSS GRO packet uses one skb plus a frag_list containing 2 additional sk_buff) High speed TCP flows benefit from this extension by lowering TCP stack cpu usage (less packets stored in receive queue, less ACK packets processed) Forwarding setups could be hurt, as such skbs will need to be linearized, although its not a new problem, as GRO could already provide skbs with a frag_list. We could make the 65536 bytes threshold a tunable to mitigate this. (First time we need to linearize skb in skb_needs_linearize(), we could lower the tunable to ~16*1460 so that following skb_gro_receive() calls build smaller skbs) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d81cff119f73..8ead744fcc94 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2936,32 +2936,30 @@ EXPORT_SYMBOL_GPL(skb_segment); int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) { - struct sk_buff *p = *head; - struct sk_buff *nskb; - struct skb_shared_info *skbinfo = skb_shinfo(skb); - struct skb_shared_info *pinfo = skb_shinfo(p); - unsigned int headroom; - unsigned int len = skb_gro_len(skb); + struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb); unsigned int offset = skb_gro_offset(skb); unsigned int headlen = skb_headlen(skb); + struct sk_buff *nskb, *lp, *p = *head; + unsigned int len = skb_gro_len(skb); unsigned int delta_truesize; + unsigned int headroom; - if (p->len + len >= 65536) + if (unlikely(p->len + len >= 65536)) return -E2BIG; - if (pinfo->frag_list) - goto merge; - else if (headlen <= offset) { + lp = NAPI_GRO_CB(p)->last ?: p; + pinfo = skb_shinfo(lp); + + if (headlen <= offset) { skb_frag_t *frag; skb_frag_t *frag2; int i = skbinfo->nr_frags; int nr_frags = pinfo->nr_frags + i; - offset -= headlen; - if (nr_frags > MAX_SKB_FRAGS) - return -E2BIG; + goto merge; + offset -= headlen; pinfo->nr_frags = nr_frags; skbinfo->nr_frags = 0; @@ -2992,7 +2990,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) unsigned int first_offset; if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS) - return -E2BIG; + goto merge; first_offset = skb->data - (unsigned char *)page_address(page) + @@ -3010,7 +3008,10 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; goto done; - } else if (skb_gro_len(p) != pinfo->gso_size) + } + if (pinfo->frag_list) + goto merge; + if (skb_gro_len(p) != pinfo->gso_size) return -E2BIG; headroom = skb_headroom(p); @@ -3062,16 +3063,24 @@ merge: __skb_pull(skb, offset); - NAPI_GRO_CB(p)->last->next = skb; + if (!NAPI_GRO_CB(p)->last) + skb_shinfo(p)->frag_list = skb; + else + NAPI_GRO_CB(p)->last->next = skb; NAPI_GRO_CB(p)->last = skb; skb_header_release(skb); + lp = p; done: NAPI_GRO_CB(p)->count++; p->data_len += len; p->truesize += delta_truesize; p->len += len; - + if (lp != p) { + lp->data_len += len; + lp->truesize += delta_truesize; + lp->len += len; + } NAPI_GRO_CB(skb)->same_flow = 1; return 0; } -- cgit v1.2.3 From 400dfd3ae899849b27d398ca7894e1b44430887f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 17 Oct 2013 16:27:07 -0700 Subject: net: refactor sk_page_frag_refill() While working on virtio_net new allocation strategy to increase payload/truesize ratio, we found that refactoring sk_page_frag_refill() was needed. This patch splits sk_page_frag_refill() into two parts, adding skb_page_frag_refill() which can be used without a socket. While we are at it, add a minimum frag size of 32 for sk_page_frag_refill() Michael will either use netdev_alloc_frag() from softirq context, or skb_page_frag_refill() from process context in refill_work() (GFP_KERNEL allocations) Signed-off-by: Eric Dumazet Cc: Michael Dalton Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 ++ net/core/sock.c | 27 +++++++++++++++++++++++---- 2 files changed, 25 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1cd32f96055e..ba74474836c0 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2062,6 +2062,8 @@ static inline void skb_frag_set_page(struct sk_buff *skb, int f, __skb_frag_set_page(&skb_shinfo(skb)->frags[f], page); } +bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio); + /** * skb_frag_dma_map - maps a paged fragment via the DMA API * @dev: the device to map the fragment to diff --git a/net/core/sock.c b/net/core/sock.c index fd6afa267475..440afdca1e8f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1847,7 +1847,17 @@ EXPORT_SYMBOL(sock_alloc_send_skb); /* On 32bit arches, an skb frag is limited to 2^15 */ #define SKB_FRAG_PAGE_ORDER get_order(32768) -bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) +/** + * skb_page_frag_refill - check that a page_frag contains enough room + * @sz: minimum size of the fragment we want to get + * @pfrag: pointer to page_frag + * @prio: priority for memory allocation + * + * Note: While this allocator tries to use high order pages, there is + * no guarantee that allocations succeed. Therefore, @sz MUST be + * less or equal than PAGE_SIZE. + */ +bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio) { int order; @@ -1856,16 +1866,16 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) pfrag->offset = 0; return true; } - if (pfrag->offset < pfrag->size) + if (pfrag->offset + sz <= pfrag->size) return true; put_page(pfrag->page); } /* We restrict high order allocations to users that can afford to wait */ - order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0; + order = (prio & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0; do { - gfp_t gfp = sk->sk_allocation; + gfp_t gfp = prio; if (order) gfp |= __GFP_COMP | __GFP_NOWARN; @@ -1877,6 +1887,15 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) } } while (--order >= 0); + return false; +} +EXPORT_SYMBOL(skb_page_frag_refill); + +bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) +{ + if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) + return true; + sk_enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); return false; -- cgit v1.2.3 From 030737bcc3c404e273e97dbe06fe9561699a411b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 19 Oct 2013 11:42:54 -0700 Subject: net: generalize skb_segment() While implementing GSO/TSO support for IPIP, I found skb_segment() was assuming network header was immediately following mac header. Its not really true in the case inet_gso_segment() is stacked : By the time tcp_gso_segment() is called, network header points to the inner IP header. Let's instead assume nothing and pick the current offsets found in original skb, we have skb_headers_offset_update() helper for that. Also move the csum_start update inside skb_headers_offset_update() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8ead744fcc94..0ab32faa520f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -903,6 +903,9 @@ EXPORT_SYMBOL(skb_clone); static void skb_headers_offset_update(struct sk_buff *skb, int off) { + /* Only adjust this if it actually is csum_start rather than csum */ + if (skb->ip_summed == CHECKSUM_PARTIAL) + skb->csum_start += off; /* {transport,network,mac}_header and tail are relative to skb->head */ skb->transport_header += off; skb->network_header += off; @@ -1109,9 +1112,6 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, #endif skb->tail += off; skb_headers_offset_update(skb, nhead); - /* Only adjust this if it actually is csum_start rather than csum */ - if (skb->ip_summed == CHECKSUM_PARTIAL) - skb->csum_start += nhead; skb->cloned = 0; skb->hdr_len = 0; skb->nohdr = 0; @@ -1176,7 +1176,6 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, NUMA_NO_NODE); int oldheadroom = skb_headroom(skb); int head_copy_len, head_copy_off; - int off; if (!n) return NULL; @@ -1200,11 +1199,7 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, copy_skb_header(n, skb); - off = newheadroom - oldheadroom; - if (n->ip_summed == CHECKSUM_PARTIAL) - n->csum_start += off; - - skb_headers_offset_update(n, off); + skb_headers_offset_update(n, newheadroom - oldheadroom); return n; } @@ -2837,14 +2832,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) __copy_skb_header(nskb, skb); nskb->mac_len = skb->mac_len; - /* nskb and skb might have different headroom */ - if (nskb->ip_summed == CHECKSUM_PARTIAL) - nskb->csum_start += skb_headroom(nskb) - headroom; - - skb_reset_mac_header(nskb); - skb_set_network_header(nskb, skb->mac_len); - nskb->transport_header = (nskb->network_header + - skb_network_header_len(skb)); + skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); skb_copy_from_linear_data_offset(skb, -tnl_hlen, nskb->data - tnl_hlen, -- cgit v1.2.3 From 3347c960295583eee3fd58e5c539fb1972fbc005 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 19 Oct 2013 11:42:56 -0700 Subject: ipv4: gso: make inet_gso_segment() stackable In order to support GSO on IPIP, we need to make inet_gso_segment() stackable. It should not assume network header starts right after mac header. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 7 +++++-- net/core/dev.c | 2 ++ net/ipv4/af_inet.c | 25 ++++++++++++++++++------- 3 files changed, 25 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ba74474836c0..cad1e0c5cc04 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2722,9 +2722,12 @@ static inline struct sec_path *skb_sec_path(struct sk_buff *skb) /* Keeps track of mac header offset relative to skb->head. * It is useful for TSO of Tunneling protocol. e.g. GRE. * For non-tunnel skb it points to skb_mac_header() and for - * tunnel skb it points to outer mac header. */ + * tunnel skb it points to outer mac header. + * Keeps track of level of encapsulation of network headers. + */ struct skb_gso_cb { - int mac_offset; + int mac_offset; + int encap_level; }; #define SKB_GSO_CB(skb) ((struct skb_gso_cb *)(skb)->cb) diff --git a/net/core/dev.c b/net/core/dev.c index 1b6eadf69289..0918aadc20fd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2377,6 +2377,8 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, } SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); + SKB_GSO_CB(skb)->encap_level = 0; + skb_reset_mac_header(skb); skb_reset_mac_len(skb); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 4f8cd4fc451d..5783ab5b5ef8 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1273,16 +1273,17 @@ out: } static struct sk_buff *inet_gso_segment(struct sk_buff *skb, - netdev_features_t features) + netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; + unsigned int offset = 0; struct iphdr *iph; + bool tunnel; int proto; + int nhoff; int ihl; int id; - unsigned int offset = 0; - bool tunnel; if (unlikely(skb_shinfo(skb)->gso_type & ~(SKB_GSO_TCPV4 | @@ -1296,6 +1297,8 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, 0))) goto out; + skb_reset_network_header(skb); + nhoff = skb_network_header(skb) - skb_mac_header(skb); if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) goto out; @@ -1312,7 +1315,10 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, goto out; __skb_pull(skb, ihl); - tunnel = !!skb->encapsulation; + tunnel = SKB_GSO_CB(skb)->encap_level > 0; + if (tunnel) + features = skb->dev->hw_enc_features & netif_skb_features(skb); + SKB_GSO_CB(skb)->encap_level += ihl; skb_reset_transport_header(skb); @@ -1327,18 +1333,23 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, skb = segs; do { - iph = ip_hdr(skb); + iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); if (!tunnel && proto == IPPROTO_UDP) { iph->id = htons(id); iph->frag_off = htons(offset >> 3); if (skb->next != NULL) iph->frag_off |= htons(IP_MF); - offset += (skb->len - skb->mac_len - iph->ihl * 4); + offset += skb->len - nhoff - ihl; } else { iph->id = htons(id++); } - iph->tot_len = htons(skb->len - skb->mac_len); + iph->tot_len = htons(skb->len - nhoff); ip_send_check(iph); + if (tunnel) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } + skb->network_header = (u8 *)iph - skb->head; } while ((skb = skb->next)); out: -- cgit v1.2.3 From cb32f511a70be8967ac9025cf49c44324ced9a39 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 19 Oct 2013 11:42:57 -0700 Subject: ipip: add GSO/TSO support Now inet_gso_segment() is stackable, its relatively easy to implement GSO/TSO support for IPIP Performance results, when segmentation is done after tunnel device (as no NIC is yet enabled for TSO IPIP support) : Before patch : lpq83:~# ./netperf -H 7.7.9.84 -Cc MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.9.84 () port 0 AF_INET Recv Send Send Utilization Service Demand Socket Socket Message Elapsed Send Recv Send Recv Size Size Size Time Throughput local remote local remote bytes bytes bytes secs. 10^6bits/s % S % S us/KB us/KB 87380 16384 16384 10.00 3357.88 5.09 3.70 2.983 2.167 After patch : lpq83:~# ./netperf -H 7.7.9.84 -Cc MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.9.84 () port 0 AF_INET Recv Send Send Utilization Service Demand Socket Socket Message Elapsed Send Recv Send Recv Size Size Size Time Throughput local remote local remote bytes bytes bytes secs. 10^6bits/s % S % S us/KB us/KB 87380 16384 16384 10.00 7710.19 4.52 6.62 1.152 1.687 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 2 ++ include/linux/skbuff.h | 6 ++++-- net/core/ethtool.c | 1 + net/ipv4/af_inet.c | 9 +++++++++ net/ipv4/gre_offload.c | 3 ++- net/ipv4/ipip.c | 11 ++++++----- net/ipv4/tcp_offload.c | 1 + net/ipv4/udp_offload.c | 1 + net/ipv6/ip6_offload.c | 1 + net/ipv6/udp_offload.c | 1 + net/mpls/mpls_gso.c | 1 + 11 files changed, 29 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index a2a89a5c7be5..8dad68cede1c 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -42,6 +42,7 @@ enum { NETIF_F_TSO6_BIT, /* ... TCPv6 segmentation */ NETIF_F_FSO_BIT, /* ... FCoE segmentation */ NETIF_F_GSO_GRE_BIT, /* ... GRE with TSO */ + NETIF_F_GSO_IPIP_BIT, /* ... IPIP tunnel with TSO */ NETIF_F_GSO_UDP_TUNNEL_BIT, /* ... UDP TUNNEL with TSO */ NETIF_F_GSO_MPLS_BIT, /* ... MPLS segmentation */ /**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */ @@ -107,6 +108,7 @@ enum { #define NETIF_F_RXFCS __NETIF_F(RXFCS) #define NETIF_F_RXALL __NETIF_F(RXALL) #define NETIF_F_GSO_GRE __NETIF_F(GSO_GRE) +#define NETIF_F_GSO_IPIP __NETIF_F(GSO_IPIP) #define NETIF_F_GSO_UDP_TUNNEL __NETIF_F(GSO_UDP_TUNNEL) #define NETIF_F_GSO_MPLS __NETIF_F(GSO_MPLS) #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index cad1e0c5cc04..60729134d253 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -318,9 +318,11 @@ enum { SKB_GSO_GRE = 1 << 6, - SKB_GSO_UDP_TUNNEL = 1 << 7, + SKB_GSO_IPIP = 1 << 7, - SKB_GSO_MPLS = 1 << 8, + SKB_GSO_UDP_TUNNEL = 1 << 8, + + SKB_GSO_MPLS = 1 << 9, }; #if BITS_PER_LONG > 32 diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 78e9d9223e40..8cab7744790e 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -81,6 +81,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation", [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation", [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation", + [NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation", [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", [NETIF_F_GSO_MPLS_BIT] = "tx-mpls-segmentation", diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5783ab5b5ef8..4049906010f7 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1291,6 +1291,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_TCP_ECN | SKB_GSO_GRE | + SKB_GSO_IPIP | SKB_GSO_TCPV6 | SKB_GSO_UDP_TUNNEL | SKB_GSO_MPLS | @@ -1656,6 +1657,13 @@ static struct packet_offload ip_packet_offload __read_mostly = { }, }; +static const struct net_offload ipip_offload = { + .callbacks = { + .gso_send_check = inet_gso_send_check, + .gso_segment = inet_gso_segment, + }, +}; + static int __init ipv4_offload_init(void) { /* @@ -1667,6 +1675,7 @@ static int __init ipv4_offload_init(void) pr_crit("%s: Cannot add TCP protocol offload\n", __func__); dev_add_offload(&ip_packet_offload); + inet_add_offload(&ipip_offload, IPPROTO_IPIP); return 0; } diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 55e6bfb3a289..e5d436188464 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -39,7 +39,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_TCP_ECN | - SKB_GSO_GRE))) + SKB_GSO_GRE | + SKB_GSO_IPIP))) goto out; if (unlikely(!pskb_may_pull(skb, sizeof(*greh)))) diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 7f80fb4b82d3..fe3e9f7f1f0b 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -220,17 +220,17 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(skb->protocol != htons(ETH_P_IP))) goto tx_error; - if (likely(!skb->encapsulation)) { - skb_reset_inner_headers(skb); - skb->encapsulation = 1; - } + skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP); + if (IS_ERR(skb)) + goto out; ip_tunnel_xmit(skb, dev, tiph, tiph->protocol); return NETDEV_TX_OK; tx_error: - dev->stats.tx_errors++; dev_kfree_skb(skb); +out: + dev->stats.tx_errors++; return NETDEV_TX_OK; } @@ -275,6 +275,7 @@ static const struct net_device_ops ipip_netdev_ops = { #define IPIP_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ + NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) static void ipip_tunnel_setup(struct net_device *dev) diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 8e3113f46ec1..dfc96b00673e 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -56,6 +56,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, SKB_GSO_TCP_ECN | SKB_GSO_TCPV6 | SKB_GSO_GRE | + SKB_GSO_IPIP | SKB_GSO_MPLS | SKB_GSO_UDP_TUNNEL | 0) || diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index f35eccaa855e..83206de2bc76 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -52,6 +52,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_UDP_TUNNEL | + SKB_GSO_IPIP | SKB_GSO_GRE | SKB_GSO_MPLS) || !(type & (SKB_GSO_UDP)))) goto out; diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index b405fba91c72..5c2fc1d04196 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -96,6 +96,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_TCP_ECN | SKB_GSO_GRE | + SKB_GSO_IPIP | SKB_GSO_UDP_TUNNEL | SKB_GSO_MPLS | SKB_GSO_TCPV6 | diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 60559511bd9c..f63780ff3732 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -64,6 +64,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_UDP_TUNNEL | SKB_GSO_GRE | + SKB_GSO_IPIP | SKB_GSO_MPLS) || !(type & (SKB_GSO_UDP)))) goto out; diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c index 1bec1219ab81..851cd880b0c0 100644 --- a/net/mpls/mpls_gso.c +++ b/net/mpls/mpls_gso.c @@ -33,6 +33,7 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_TCP_ECN | SKB_GSO_GRE | + SKB_GSO_IPIP | SKB_GSO_MPLS))) goto out; -- cgit v1.2.3 From a48e42920ff38bc90bbf75143fff4555723d4540 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Sat, 19 Oct 2013 21:48:55 +0200 Subject: net: introduce new macro net_get_random_once net_get_random_once is a new macro which handles the initialization of secret keys. It is possible to call it in the fast path. Only the initialization depends on the spinlock and is rather slow. Otherwise it should get used just before the key is used to delay the entropy extration as late as possible to get better randomness. It returns true if the key got initialized. The usage of static_keys for net_get_random_once is a bit uncommon so it needs some further explanation why this actually works: === In the simple non-HAVE_JUMP_LABEL case we actually have === no constrains to use static_key_(true|false) on keys initialized with STATIC_KEY_INIT_(FALSE|TRUE). So this path just expands in favor of the likely case that the initialization is already done. The key is initialized like this: ___done_key = { .enabled = ATOMIC_INIT(0) } The check if (!static_key_true(&___done_key)) \ expands into (pseudo code) if (!likely(___done_key > 0)) , so we take the fast path as soon as ___done_key is increased from the helper function. === If HAVE_JUMP_LABELs are available this depends === on patching of jumps into the prepared NOPs, which is done in jump_label_init at boot-up time (from start_kernel). It is forbidden and dangerous to use net_get_random_once in functions which are called before that! At compilation time NOPs are generated at the call sites of net_get_random_once. E.g. net/ipv6/inet6_hashtable.c:inet6_ehashfn (we need to call net_get_random_once two times in inet6_ehashfn, so two NOPs): 71: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) 76: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) Both will be patched to the actual jumps to the end of the function to call __net_get_random_once at boot time as explained above. arch_static_branch is optimized and inlined for false as return value and actually also returns false in case the NOP is placed in the instruction stream. So in the fast case we get a "return false". But because we initialize ___done_key with (enabled != (entries & 1)) this call-site will get patched up at boot thus returning true. The final check looks like this: if (!static_key_true(&___done_key)) \ ___ret = __net_get_random_once(buf, \ expands to if (!!static_key_false(&___done_key)) \ ___ret = __net_get_random_once(buf, \ So we get true at boot time and as soon as static_key_slow_inc is called on the key it will invert the logic and return false for the fast path. static_key_slow_inc will change the branch because it got initialized with .enabled == 0. After static_key_slow_inc is called on the key the branch is replaced with a nop again. === Misc: === The helper defers the increment into a workqueue so we don't have problems calling this code from atomic sections. A seperate boolean (___done) guards the case where we enter net_get_random_once again before the increment happend. Cc: Ingo Molnar Cc: Steven Rostedt Cc: Jason Baron Cc: Peter Zijlstra Cc: Eric Dumazet Cc: "David S. Miller" Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/linux/net.h | 25 +++++++++++++++++++++++++ net/core/utils.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) (limited to 'net/core') diff --git a/include/linux/net.h b/include/linux/net.h index ca9ec8540905..a489705f6fa3 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -239,6 +239,31 @@ do { \ #define net_random() prandom_u32() #define net_srandom(seed) prandom_seed((__force u32)(seed)) +bool __net_get_random_once(void *buf, int nbytes, bool *done, + struct static_key *done_key); + +#ifdef HAVE_JUMP_LABEL +#define ___NET_RANDOM_STATIC_KEY_INIT ((struct static_key) \ + { .enabled = ATOMIC_INIT(0), .entries = (void *)1 }) +#else /* !HAVE_JUMP_LABEL */ +#define ___NET_RANDOM_STATIC_KEY_INIT STATIC_KEY_INIT_FALSE +#endif /* HAVE_JUMP_LABEL */ + +/* BE CAREFUL: this function is not interrupt safe */ +#define net_get_random_once(buf, nbytes) \ + ({ \ + bool ___ret = false; \ + static bool ___done = false; \ + static struct static_key ___done_key = \ + ___NET_RANDOM_STATIC_KEY_INIT; \ + if (!static_key_true(&___done_key)) \ + ___ret = __net_get_random_once(buf, \ + nbytes, \ + &___done, \ + &___done_key); \ + ___ret; \ + }) + int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t len); int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, diff --git a/net/core/utils.c b/net/core/utils.c index aa88e23fc87a..bf09371e19b1 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -338,3 +338,51 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, csum_unfold(*sum))); } EXPORT_SYMBOL(inet_proto_csum_replace16); + +struct __net_random_once_work { + struct work_struct work; + struct static_key *key; +}; + +static void __net_random_once_deferred(struct work_struct *w) +{ + struct __net_random_once_work *work = + container_of(w, struct __net_random_once_work, work); + if (!static_key_enabled(work->key)) + static_key_slow_inc(work->key); + kfree(work); +} + +static void __net_random_once_disable_jump(struct static_key *key) +{ + struct __net_random_once_work *w; + + w = kmalloc(sizeof(*w), GFP_ATOMIC); + if (!w) + return; + + INIT_WORK(&w->work, __net_random_once_deferred); + w->key = key; + schedule_work(&w->work); +} + +bool __net_get_random_once(void *buf, int nbytes, bool *done, + struct static_key *done_key) +{ + static DEFINE_SPINLOCK(lock); + + spin_lock_bh(&lock); + if (*done) { + spin_unlock_bh(&lock); + return false; + } + + get_random_bytes(buf, nbytes); + *done = true; + spin_unlock_bh(&lock); + + __net_random_once_disable_jump(done_key); + + return true; +} +EXPORT_SYMBOL(__net_get_random_once); -- cgit v1.2.3 From e34c9a69970d8664a36b46e6445a7cc879111cfd Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Sat, 19 Oct 2013 21:48:59 +0200 Subject: net: switch net_secret key generation to net_get_random_once Cc: Eric Dumazet Cc: "David S. Miller" Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/core/secure_seq.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'net/core') diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 3f1ec1586ae1..b02fd16b8942 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -16,18 +17,7 @@ static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned; static void net_secret_init(void) { - u32 tmp; - int i; - - if (likely(net_secret[0])) - return; - - for (i = NET_SECRET_SIZE; i > 0;) { - do { - get_random_bytes(&tmp, sizeof(tmp)); - } while (!tmp); - cmpxchg(&net_secret[--i], 0, tmp); - } + net_get_random_once(net_secret, sizeof(net_secret)); } #ifdef CONFIG_INET -- cgit v1.2.3 From 61c1db7fae21ed33c614356a43bf6580c5e53118 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 20 Oct 2013 20:47:30 -0700 Subject: ipv6: sit: add GSO/TSO support Now ipv6_gso_segment() is stackable, its relatively easy to implement GSO/TSO support for SIT tunnels Performance results, when segmentation is done after tunnel device (as no NIC is yet enabled for TSO SIT support) : Before patch : lpq84:~# ./netperf -H 2002:af6:1153:: -Cc MIGRATED TCP STREAM TEST from ::0 (::) port 0 AF_INET6 to 2002:af6:1153:: () port 0 AF_INET6 Recv Send Send Utilization Service Demand Socket Socket Message Elapsed Send Recv Send Recv Size Size Size Time Throughput local remote local remote bytes bytes bytes secs. 10^6bits/s % S % S us/KB us/KB 87380 16384 16384 10.00 3168.31 4.81 4.64 2.988 2.877 After patch : lpq84:~# ./netperf -H 2002:af6:1153:: -Cc MIGRATED TCP STREAM TEST from ::0 (::) port 0 AF_INET6 to 2002:af6:1153:: () port 0 AF_INET6 Recv Send Send Utilization Service Demand Socket Socket Message Elapsed Send Recv Send Recv Size Size Size Time Throughput local remote local remote bytes bytes bytes secs. 10^6bits/s % S % S us/KB us/KB 87380 16384 16384 10.00 5525.00 7.76 5.17 2.763 1.840 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 2 ++ include/linux/skbuff.h | 6 ++++-- net/core/ethtool.c | 1 + net/ipv4/af_inet.c | 1 + net/ipv4/tcp_offload.c | 1 + net/ipv6/ip6_offload.c | 11 +++++++++++ net/ipv6/sit.c | 28 +++++++++++++++++++--------- net/ipv6/udp_offload.c | 1 + 8 files changed, 40 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 8dad68cede1c..b05a4b501ab5 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -43,6 +43,7 @@ enum { NETIF_F_FSO_BIT, /* ... FCoE segmentation */ NETIF_F_GSO_GRE_BIT, /* ... GRE with TSO */ NETIF_F_GSO_IPIP_BIT, /* ... IPIP tunnel with TSO */ + NETIF_F_GSO_SIT_BIT, /* ... SIT tunnel with TSO */ NETIF_F_GSO_UDP_TUNNEL_BIT, /* ... UDP TUNNEL with TSO */ NETIF_F_GSO_MPLS_BIT, /* ... MPLS segmentation */ /**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */ @@ -109,6 +110,7 @@ enum { #define NETIF_F_RXALL __NETIF_F(RXALL) #define NETIF_F_GSO_GRE __NETIF_F(GSO_GRE) #define NETIF_F_GSO_IPIP __NETIF_F(GSO_IPIP) +#define NETIF_F_GSO_SIT __NETIF_F(GSO_SIT) #define NETIF_F_GSO_UDP_TUNNEL __NETIF_F(GSO_UDP_TUNNEL) #define NETIF_F_GSO_MPLS __NETIF_F(GSO_MPLS) #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 60729134d253..2c154976394b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -320,9 +320,11 @@ enum { SKB_GSO_IPIP = 1 << 7, - SKB_GSO_UDP_TUNNEL = 1 << 8, + SKB_GSO_SIT = 1 << 8, - SKB_GSO_MPLS = 1 << 9, + SKB_GSO_UDP_TUNNEL = 1 << 9, + + SKB_GSO_MPLS = 1 << 10, }; #if BITS_PER_LONG > 32 diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 8cab7744790e..862989898f61 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -82,6 +82,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation", [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation", [NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation", + [NETIF_F_GSO_SIT_BIT] = "tx-sit-segmentation", [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", [NETIF_F_GSO_MPLS_BIT] = "tx-mpls-segmentation", diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 24a53fc275b0..f4a159e705c0 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1265,6 +1265,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, SKB_GSO_TCP_ECN | SKB_GSO_GRE | SKB_GSO_IPIP | + SKB_GSO_SIT | SKB_GSO_TCPV6 | SKB_GSO_UDP_TUNNEL | SKB_GSO_MPLS | diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index dfc96b00673e..a7a5583eab04 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -57,6 +57,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, SKB_GSO_TCPV6 | SKB_GSO_GRE | SKB_GSO_IPIP | + SKB_GSO_SIT | SKB_GSO_MPLS | SKB_GSO_UDP_TUNNEL | 0) || diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index f9b33d82bb9d..4b851692b1f6 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -98,6 +98,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, SKB_GSO_TCP_ECN | SKB_GSO_GRE | SKB_GSO_IPIP | + SKB_GSO_SIT | SKB_GSO_UDP_TUNNEL | SKB_GSO_MPLS | SKB_GSO_TCPV6 | @@ -276,6 +277,13 @@ static struct packet_offload ipv6_packet_offload __read_mostly = { }, }; +static const struct net_offload sit_offload = { + .callbacks = { + .gso_send_check = ipv6_gso_send_check, + .gso_segment = ipv6_gso_segment, + }, +}; + static int __init ipv6_offload_init(void) { @@ -287,6 +295,9 @@ static int __init ipv6_offload_init(void) pr_crit("%s: Cannot add EXTHDRS protocol offload\n", __func__); dev_add_offload(&ipv6_packet_offload); + + inet_add_offload(&sit_offload, IPPROTO_IPV6); + return 0; } diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 19269453a8ea..3a9038dd818d 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -933,10 +933,9 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, ttl = iph6->hop_limit; tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6)); - if (likely(!skb->encapsulation)) { - skb_reset_inner_headers(skb); - skb->encapsulation = 1; - } + skb = iptunnel_handle_offloads(skb, false, SKB_GSO_SIT); + if (IS_ERR(skb)) + goto out; err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, IPPROTO_IPV6, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev))); @@ -946,8 +945,9 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, tx_error_icmp: dst_link_failure(skb); tx_error: - dev->stats.tx_errors++; dev_kfree_skb(skb); +out: + dev->stats.tx_errors++; return NETDEV_TX_OK; } @@ -956,13 +956,15 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tiph = &tunnel->parms.iph; - if (likely(!skb->encapsulation)) { - skb_reset_inner_headers(skb); - skb->encapsulation = 1; - } + skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP); + if (IS_ERR(skb)) + goto out; ip_tunnel_xmit(skb, dev, tiph, IPPROTO_IPIP); return NETDEV_TX_OK; +out: + dev->stats.tx_errors++; + return NETDEV_TX_OK; } static netdev_tx_t sit_tunnel_xmit(struct sk_buff *skb, @@ -1292,6 +1294,12 @@ static void ipip6_dev_free(struct net_device *dev) free_netdev(dev); } +#define SIT_FEATURES (NETIF_F_SG | \ + NETIF_F_FRAGLIST | \ + NETIF_F_HIGHDMA | \ + NETIF_F_GSO_SOFTWARE | \ + NETIF_F_HW_CSUM) + static void ipip6_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipip6_netdev_ops; @@ -1305,6 +1313,8 @@ static void ipip6_tunnel_setup(struct net_device *dev) dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_LLTX; + dev->features |= SIT_FEATURES; + dev->hw_features |= SIT_FEATURES; } static int ipip6_tunnel_init(struct net_device *dev) diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index f63780ff3732..08e23b0bf302 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -65,6 +65,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, SKB_GSO_UDP_TUNNEL | SKB_GSO_GRE | SKB_GSO_IPIP | + SKB_GSO_SIT | SKB_GSO_MPLS) || !(type & (SKB_GSO_UDP)))) goto out; -- cgit v1.2.3 From 0a6957e7d47096bbeedda4e1d926359eb487dcfc Mon Sep 17 00:00:00 2001 From: ZHAO Gang Date: Tue, 22 Oct 2013 16:23:38 +0800 Subject: net: remove function sk_reset_txq() What sk_reset_txq() does is just calls function sk_tx_queue_reset(), and sk_reset_txq() is used only in sock.h, by dst_negative_advice(). Let dst_negative_advice() calls sk_tx_queue_reset() directly so we can remove unneeded sk_reset_txq(). Signed-off-by: ZHAO Gang Signed-off-by: David S. Miller --- include/net/sock.h | 4 +--- net/core/sock.c | 6 ------ 2 files changed, 1 insertion(+), 9 deletions(-) (limited to 'net/core') diff --git a/include/net/sock.h b/include/net/sock.h index 86bb0668c581..c93542f92420 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1746,8 +1746,6 @@ sk_dst_get(struct sock *sk) return dst; } -void sk_reset_txq(struct sock *sk); - static inline void dst_negative_advice(struct sock *sk) { struct dst_entry *ndst, *dst = __sk_dst_get(sk); @@ -1757,7 +1755,7 @@ static inline void dst_negative_advice(struct sock *sk) if (ndst != dst) { rcu_assign_pointer(sk->sk_dst_cache, ndst); - sk_reset_txq(sk); + sk_tx_queue_clear(sk); } } } diff --git a/net/core/sock.c b/net/core/sock.c index 440afdca1e8f..ab20ed9b0f31 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -475,12 +475,6 @@ discard_and_relse: } EXPORT_SYMBOL(sk_receive_skb); -void sk_reset_txq(struct sock *sk) -{ - sk_tx_queue_clear(sk); -} -EXPORT_SYMBOL(sk_reset_txq); - struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = __sk_dst_get(sk); -- cgit v1.2.3 From 34d92d5315b64a3e5292b7e9511c1bb617227fb6 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Wed, 23 Oct 2013 08:44:50 +0200 Subject: net: always inline net_secret_init Currently net_secret_init does not get inlined, so we always have a call to net_secret_init even in the fast path. Let's specify net_secret_init as __always_inline so we have the nop in the fast-path without the call to net_secret_init and the unlikely path at the epilogue of the function. jump_labels handle the inlining correctly. Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/core/secure_seq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index b02fd16b8942..90e8a8250255 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -15,7 +15,7 @@ static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned; -static void net_secret_init(void) +static __always_inline void net_secret_init(void) { net_get_random_once(net_secret, sizeof(net_secret)); } -- cgit v1.2.3 From 974daef7f8bb5d7be78fae3a240fcce43cae0135 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 23 Oct 2013 15:28:56 +0200 Subject: net: add missing dev_put() in __netdev_adjacent_dev_insert I think that a dev_put() is needed in the error path to preserve the proper dev refcount. CC: Veaceslav Falico Signed-off-by: Nikolay Aleksandrov Acked-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 0918aadc20fd..bdffd654edc4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4648,6 +4648,7 @@ remove_symlinks: free_adj: kfree(adj); + dev_put(adj_dev); return ret; } -- cgit v1.2.3 From f84be2bd96a108b09c8440263fa3adb3fb225fa3 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Wed, 23 Oct 2013 20:05:27 +0200 Subject: net: make net_get_random_once irq safe I initial build non irq safe version of net_get_random_once because I would liked to have the freedom to defer even the extraction process of get_random_bytes until the nonblocking pool is fully seeded. I don't think this is a good idea anymore and thus this patch makes net_get_random_once irq safe. Now someone using net_get_random_once does not need to care from where it is called. Cc: David S. Miller Cc: Eric Dumazet Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/linux/net.h | 1 - net/core/utils.c | 7 ++++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/include/linux/net.h b/include/linux/net.h index aca446b46754..b292a0435571 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -250,7 +250,6 @@ bool __net_get_random_once(void *buf, int nbytes, bool *done, #define ___NET_RANDOM_STATIC_KEY_INIT STATIC_KEY_INIT_FALSE #endif /* HAVE_JUMP_LABEL */ -/* BE CAREFUL: this function is not interrupt safe */ #define net_get_random_once(buf, nbytes) \ ({ \ bool ___ret = false; \ diff --git a/net/core/utils.c b/net/core/utils.c index bf09371e19b1..2f737bf90b3f 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -370,16 +370,17 @@ bool __net_get_random_once(void *buf, int nbytes, bool *done, struct static_key *done_key) { static DEFINE_SPINLOCK(lock); + unsigned long flags; - spin_lock_bh(&lock); + spin_lock_irqsave(&lock, flags); if (*done) { - spin_unlock_bh(&lock); + spin_unlock_irqrestore(&lock, flags); return false; } get_random_bytes(buf, nbytes); *done = true; - spin_unlock_bh(&lock); + spin_unlock_irqrestore(&lock, flags); __net_random_once_disable_jump(done_key); -- cgit v1.2.3 From 66415cf8a1b99d101317f5aa08574b1ec8832672 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Wed, 23 Oct 2013 20:06:00 +0200 Subject: net: initialize hashrnd in flow_dissector with net_get_random_once We also can defer the initialization of hashrnd in flow_dissector to its first use. Since net_get_random_once is irq safe now we don't have to audit the call paths if one of this functions get called by an interrupt handler. Cc: David S. Miller Cc: Eric Dumazet Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index f8e25ac41c6c..5cac36e6ccd1 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -184,6 +184,22 @@ ipv6: EXPORT_SYMBOL(skb_flow_dissect); static u32 hashrnd __read_mostly; +static __always_inline void __flow_hash_secret_init(void) +{ + net_get_random_once(&hashrnd, sizeof(hashrnd)); +} + +static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c) +{ + __flow_hash_secret_init(); + return jhash_3words(a, b, c, hashrnd); +} + +static __always_inline u32 __flow_hash_1word(u32 a) +{ + __flow_hash_secret_init(); + return jhash_1word(a, hashrnd); +} /* * __skb_get_rxhash: calculate a flow hash based on src/dst addresses @@ -210,9 +226,9 @@ void __skb_get_rxhash(struct sk_buff *skb) swap(keys.port16[0], keys.port16[1]); } - hash = jhash_3words((__force u32)keys.dst, - (__force u32)keys.src, - (__force u32)keys.ports, hashrnd); + hash = __flow_hash_3words((__force u32)keys.dst, + (__force u32)keys.src, + (__force u32)keys.ports); if (!hash) hash = 1; @@ -248,7 +264,7 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, hash = skb->sk->sk_hash; else hash = (__force u16) skb->protocol; - hash = jhash_1word(hash, hashrnd); + hash = __flow_hash_1word(hash); return (u16) (((u64) hash * qcount) >> 32) + qoffset; } @@ -340,7 +356,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) else hash = (__force u16) skb->protocol ^ skb->rxhash; - hash = jhash_1word(hash, hashrnd); + hash = __flow_hash_1word(hash); queue_index = map->queues[ ((u64)hash * map->len) >> 32]; } @@ -395,11 +411,3 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, skb_set_queue_mapping(skb, queue_index); return netdev_get_tx_queue(dev, queue_index); } - -static int __init initialize_hashrnd(void) -{ - get_random_bytes(&hashrnd, sizeof(hashrnd)); - return 0; -} - -late_initcall_sync(initialize_hashrnd); -- cgit v1.2.3 From 7f29405403d7c17f539c099987972b862e7e5255 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 23 Oct 2013 16:02:42 -0700 Subject: net: fix rtnl notification in atomic context commit 991fb3f74c "dev: always advertise rx_flags changes via netlink" introduced rtnl notification from __dev_set_promiscuity(), which can be called in atomic context. Steps to reproduce: ip tuntap add dev tap1 mode tap ifconfig tap1 up tcpdump -nei tap1 & ip tuntap del dev tap1 mode tap [ 271.627994] device tap1 left promiscuous mode [ 271.639897] BUG: sleeping function called from invalid context at mm/slub.c:940 [ 271.664491] in_atomic(): 1, irqs_disabled(): 0, pid: 3394, name: ip [ 271.677525] INFO: lockdep is turned off. [ 271.690503] CPU: 0 PID: 3394 Comm: ip Tainted: G W 3.12.0-rc3+ #73 [ 271.703996] Hardware name: System manufacturer System Product Name/P8Z77 WS, BIOS 3007 07/26/2012 [ 271.731254] ffffffff81a58506 ffff8807f0d57a58 ffffffff817544e5 ffff88082fa0f428 [ 271.760261] ffff8808071f5f40 ffff8807f0d57a88 ffffffff8108bad1 ffffffff81110ff8 [ 271.790683] 0000000000000010 00000000000000d0 00000000000000d0 ffff8807f0d57af8 [ 271.822332] Call Trace: [ 271.838234] [] dump_stack+0x55/0x76 [ 271.854446] [] __might_sleep+0x181/0x240 [ 271.870836] [] ? rcu_irq_exit+0x68/0xb0 [ 271.887076] [] kmem_cache_alloc_node+0x4e/0x2a0 [ 271.903368] [] ? vprintk_emit+0x1dc/0x5a0 [ 271.919716] [] ? __alloc_skb+0x57/0x2a0 [ 271.936088] [] ? vprintk_emit+0x1e0/0x5a0 [ 271.952504] [] __alloc_skb+0x57/0x2a0 [ 271.968902] [] rtmsg_ifinfo+0x52/0x100 [ 271.985302] [] __dev_notify_flags+0xad/0xc0 [ 272.001642] [] __dev_set_promiscuity+0x8c/0x1c0 [ 272.017917] [] ? packet_notifier+0x5/0x380 [ 272.033961] [] dev_set_promiscuity+0x29/0x50 [ 272.049855] [] packet_dev_mc+0x87/0xc0 [ 272.065494] [] packet_notifier+0x1b2/0x380 [ 272.080915] [] ? packet_notifier+0x5/0x380 [ 272.096009] [] notifier_call_chain+0x66/0x150 [ 272.110803] [] __raw_notifier_call_chain+0xe/0x10 [ 272.125468] [] raw_notifier_call_chain+0x16/0x20 [ 272.139984] [] call_netdevice_notifiers_info+0x40/0x70 [ 272.154523] [] call_netdevice_notifiers+0x16/0x20 [ 272.168552] [] rollback_registered_many+0x145/0x240 [ 272.182263] [] rollback_registered+0x31/0x40 [ 272.195369] [] unregister_netdevice_queue+0x58/0x90 [ 272.208230] [] __tun_detach+0x140/0x340 [ 272.220686] [] tun_chr_close+0x36/0x60 Signed-off-by: Alexei Starovoitov Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 4 ++-- include/linux/rtnetlink.h | 2 +- net/core/dev.c | 16 ++++++++-------- net/core/rtnetlink.c | 9 +++++---- 4 files changed, 16 insertions(+), 15 deletions(-) (limited to 'net/core') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 2daa066c6cdd..a141f406cb98 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1213,7 +1213,7 @@ static int bond_master_upper_dev_link(struct net_device *bond_dev, if (err) return err; slave_dev->flags |= IFF_SLAVE; - rtmsg_ifinfo(RTM_NEWLINK, slave_dev, IFF_SLAVE); + rtmsg_ifinfo(RTM_NEWLINK, slave_dev, IFF_SLAVE, GFP_KERNEL); return 0; } @@ -1222,7 +1222,7 @@ static void bond_upper_dev_unlink(struct net_device *bond_dev, { netdev_upper_dev_unlink(slave_dev, bond_dev); slave_dev->flags &= ~IFF_SLAVE; - rtmsg_ifinfo(RTM_NEWLINK, slave_dev, IFF_SLAVE); + rtmsg_ifinfo(RTM_NEWLINK, slave_dev, IFF_SLAVE, GFP_KERNEL); } /* enslave device to bond device */ diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index f28544b2f9af..939428ad25ac 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -15,7 +15,7 @@ extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics); extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error); -extern void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change); +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags); /* RTNL is used as a global lock for all changes to network configuration */ extern void rtnl_lock(void); diff --git a/net/core/dev.c b/net/core/dev.c index bdffd654edc4..0054c8c75f50 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1203,7 +1203,7 @@ void netdev_state_change(struct net_device *dev) { if (dev->flags & IFF_UP) { call_netdevice_notifiers(NETDEV_CHANGE, dev); - rtmsg_ifinfo(RTM_NEWLINK, dev, 0); + rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); } } EXPORT_SYMBOL(netdev_state_change); @@ -1293,7 +1293,7 @@ int dev_open(struct net_device *dev) if (ret < 0) return ret; - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); call_netdevice_notifiers(NETDEV_UP, dev); return ret; @@ -1371,7 +1371,7 @@ static int dev_close_many(struct list_head *head) __dev_close_many(head); list_for_each_entry_safe(dev, tmp, head, close_list) { - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); call_netdevice_notifiers(NETDEV_DOWN, dev); list_del_init(&dev->close_list); } @@ -5258,7 +5258,7 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, unsigned int changes = dev->flags ^ old_flags; if (gchanges) - rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges); + rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC); if (changes & IFF_UP) { if (dev->flags & IFF_UP) @@ -5490,7 +5490,7 @@ static void rollback_registered_many(struct list_head *head) if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); + rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); /* * Flush the unicast and multicast chains @@ -5889,7 +5889,7 @@ int register_netdevice(struct net_device *dev) */ if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); out: return ret; @@ -6501,7 +6501,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); + rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); /* * Flush the unicast and multicast chains @@ -6540,7 +6540,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. */ - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); synchronize_net(); err = 0; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 4aedf03da052..cf67144d3e3c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1984,14 +1984,15 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change) +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, + gfp_t flags) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; size_t if_info_size; - skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), GFP_KERNEL); + skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), flags); if (skb == NULL) goto errout; @@ -2002,7 +2003,7 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change) kfree_skb(skb); goto errout; } - rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); return; errout: if (err < 0) @@ -2716,7 +2717,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_JOIN: break; default: - rtmsg_ifinfo(RTM_NEWLINK, dev, 0); + rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); break; } return NOTIFY_DONE; -- cgit v1.2.3 From c4e819d16c0f46fbdd3706adbd990b3b292d726c Mon Sep 17 00:00:00 2001 From: Zhi Yong Wu Date: Mon, 28 Oct 2013 14:01:49 +0800 Subject: net, datagram: fix the incorrect comment in zerocopy_sg_from_iovec() Signed-off-by: Zhi Yong Wu Signed-off-by: David S. Miller --- net/core/datagram.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index af814e764206..a16ed7bbe376 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -577,7 +577,7 @@ EXPORT_SYMBOL(skb_copy_datagram_from_iovec); /** * zerocopy_sg_from_iovec - Build a zerocopy datagram from an iovec * @skb: buffer to copy - * @from: io vector to copy to + * @from: io vector to copy from * @offset: offset in the io vector to start copying from * @count: amount of vectors to copy to buffer from * -- cgit v1.2.3 From ab1a2d7773b23dbbb863fd63fcf83b67cf361e34 Mon Sep 17 00:00:00 2001 From: Zhi Yong Wu Date: Mon, 28 Oct 2013 14:01:50 +0800 Subject: net, iovec: fix the incorrect comment in memcpy_fromiovecend() Signed-off-by: Zhi Yong Wu Signed-off-by: David S. Miller --- net/core/iovec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/iovec.c b/net/core/iovec.c index b77eeecc0011..4cdb7c48dad6 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -100,7 +100,7 @@ int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata, EXPORT_SYMBOL(memcpy_toiovecend); /* - * Copy iovec from kernel. Returns -EFAULT on error. + * Copy iovec to kernel. Returns -EFAULT on error. */ int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, -- cgit v1.2.3 From cdfb97bc010d9e9d994eb68f2cbac3a8ada26104 Mon Sep 17 00:00:00 2001 From: Zhi Yong Wu Date: Mon, 28 Oct 2013 16:15:50 +0800 Subject: net, mc: fix the incorrect comments in two mc-related functions Signed-off-by: Zhi Yong Wu Signed-off-by: David S. Miller --- net/core/dev_addr_lists.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 6cda4e2c2132..ec40a849fc42 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -752,7 +752,7 @@ int dev_mc_del_global(struct net_device *dev, const unsigned char *addr) EXPORT_SYMBOL(dev_mc_del_global); /** - * dev_mc_sync - Synchronize device's unicast list to another device + * dev_mc_sync - Synchronize device's multicast list to another device * @to: destination device * @from: source device * @@ -780,7 +780,7 @@ int dev_mc_sync(struct net_device *to, struct net_device *from) EXPORT_SYMBOL(dev_mc_sync); /** - * dev_mc_sync_multiple - Synchronize device's unicast list to another + * dev_mc_sync_multiple - Synchronize device's multicast list to another * device, but allow for multiple calls to sync to multiple devices. * @to: destination device * @from: source device -- cgit v1.2.3 From 2817a336d4d533fb8b68719723cd60ea7dd7c09e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 30 Oct 2013 11:50:51 +0100 Subject: net: skb_checksum: allow custom update/combine for walking skb Currently, skb_checksum walks over 1) linearized, 2) frags[], and 3) frag_list data and calculats the one's complement, a 32 bit result suitable for feeding into itself or csum_tcpudp_magic(), but unsuitable for SCTP as we're calculating CRC32c there. Hence, in order to not re-implement the very same function in SCTP (and maybe other protocols) over and over again, use an update() + combine() callback internally to allow for walking over the skb with different algorithms. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/skbuff.h | 13 ++++++++++--- include/net/checksum.h | 6 ++++++ net/core/skbuff.c | 31 +++++++++++++++++++++---------- 3 files changed, 37 insertions(+), 13 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2c154976394b..44727b5d4981 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2360,8 +2360,6 @@ int skb_copy_datagram_const_iovec(const struct sk_buff *from, int offset, void skb_free_datagram(struct sock *sk, struct sk_buff *skb); void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb); int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags); -__wsum skb_checksum(const struct sk_buff *skb, int offset, int len, - __wsum csum); int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len); int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len); __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to, @@ -2373,9 +2371,18 @@ void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen); void skb_scrub_packet(struct sk_buff *skb, bool xnet); - struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); +struct skb_checksum_ops { + __wsum (*update)(const void *mem, int len, __wsum wsum); + __wsum (*combine)(__wsum csum, __wsum csum2, int offset, int len); +}; + +__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, + __wsum csum, const struct skb_checksum_ops *ops); +__wsum skb_checksum(const struct sk_buff *skb, int offset, int len, + __wsum csum); + static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer) { diff --git a/include/net/checksum.h b/include/net/checksum.h index 8f59ca50477c..15f33fde826e 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -78,6 +78,12 @@ csum_block_add(__wsum csum, __wsum csum2, int offset) return csum_add(csum, (__force __wsum)sum); } +static inline __wsum +csum_block_add_ext(__wsum csum, __wsum csum2, int offset, int len) +{ + return csum_block_add(csum, csum2, offset); +} + static inline __wsum csum_block_sub(__wsum csum, __wsum csum2, int offset) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0ab32faa520f..31aab5376cb4 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1928,9 +1928,8 @@ fault: EXPORT_SYMBOL(skb_store_bits); /* Checksum skb data. */ - -__wsum skb_checksum(const struct sk_buff *skb, int offset, - int len, __wsum csum) +__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, + __wsum csum, const struct skb_checksum_ops *ops) { int start = skb_headlen(skb); int i, copy = start - offset; @@ -1941,7 +1940,7 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset, if (copy > 0) { if (copy > len) copy = len; - csum = csum_partial(skb->data + offset, copy, csum); + csum = ops->update(skb->data + offset, copy, csum); if ((len -= copy) == 0) return csum; offset += copy; @@ -1962,10 +1961,10 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset, if (copy > len) copy = len; vaddr = kmap_atomic(skb_frag_page(frag)); - csum2 = csum_partial(vaddr + frag->page_offset + - offset - start, copy, 0); + csum2 = ops->update(vaddr + frag->page_offset + + offset - start, copy, 0); kunmap_atomic(vaddr); - csum = csum_block_add(csum, csum2, pos); + csum = ops->combine(csum, csum2, pos, copy); if (!(len -= copy)) return csum; offset += copy; @@ -1984,9 +1983,9 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset, __wsum csum2; if (copy > len) copy = len; - csum2 = skb_checksum(frag_iter, offset - start, - copy, 0); - csum = csum_block_add(csum, csum2, pos); + csum2 = __skb_checksum(frag_iter, offset - start, + copy, 0, ops); + csum = ops->combine(csum, csum2, pos, copy); if ((len -= copy) == 0) return csum; offset += copy; @@ -1998,6 +1997,18 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset, return csum; } +EXPORT_SYMBOL(__skb_checksum); + +__wsum skb_checksum(const struct sk_buff *skb, int offset, + int len, __wsum csum) +{ + const struct skb_checksum_ops ops = { + .update = csum_partial, + .combine = csum_block_add_ext, + }; + + return __skb_checksum(skb, offset, len, csum, &ops); +} EXPORT_SYMBOL(skb_checksum); /* Both of above in one bottle. */ -- cgit v1.2.3 From 74d332c13b2148ae934ea94dac1745ae92efe8e5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 30 Oct 2013 13:10:44 -0700 Subject: net: extend net_device allocation to vmalloc() Joby Poriyath provided a xen-netback patch to reduce the size of xenvif structure as some netdev allocation could fail under memory pressure/fragmentation. This patch is handling the problem at the core level, allowing any netdev structures to use vmalloc() if kmalloc() failed. As vmalloc() adds overhead on a critical network path, add __GFP_REPEAT to kzalloc() flags to do this fallback only when really needed. Signed-off-by: Eric Dumazet Reported-by: Joby Poriyath Cc: Ben Hutchings Signed-off-by: David S. Miller --- Documentation/networking/netdevices.txt | 10 +++++----- include/linux/netdevice.h | 1 + net/core/dev.c | 22 +++++++++++++++++----- net/core/net-sysfs.c | 2 +- 4 files changed, 24 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/Documentation/networking/netdevices.txt b/Documentation/networking/netdevices.txt index c7ecc7080494..0b1cf6b2a592 100644 --- a/Documentation/networking/netdevices.txt +++ b/Documentation/networking/netdevices.txt @@ -10,12 +10,12 @@ network devices. struct net_device allocation rules ================================== Network device structures need to persist even after module is unloaded and -must be allocated with kmalloc. If device has registered successfully, -it will be freed on last use by free_netdev. This is required to handle the -pathologic case cleanly (example: rmmod mydriver padded; + + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); +} + /** * alloc_netdev_mqs - allocate network device * @sizeof_priv: size of private data to allocate space for @@ -6239,7 +6249,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, /* ensure 32-byte alignment of whole construct */ alloc_size += NETDEV_ALIGN - 1; - p = kzalloc(alloc_size, GFP_KERNEL); + p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + if (!p) + p = vzalloc(alloc_size); if (!p) return NULL; @@ -6248,7 +6260,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) - goto free_p; + goto free_dev; if (dev_addr_init(dev)) goto free_pcpu; @@ -6301,8 +6313,8 @@ free_pcpu: kfree(dev->_rx); #endif -free_p: - kfree(p); +free_dev: + netdev_freemem(dev); return NULL; } EXPORT_SYMBOL(alloc_netdev_mqs); @@ -6339,7 +6351,7 @@ void free_netdev(struct net_device *dev) /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { - kfree((char *)dev - dev->padded); + netdev_freemem(dev); return; } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index d954b56b4e47..d03f2c9750fa 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1263,7 +1263,7 @@ static void netdev_release(struct device *d) BUG_ON(dev->reg_state != NETREG_RELEASED); kfree(dev->ifalias); - kfree((char *)dev - dev->padded); + netdev_freemem(dev); } static const void *net_namespace(struct device *d) -- cgit v1.2.3 From cea80ea8d2a4c646f240a8fd6ece5c8e7bc969d3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 4 Nov 2013 17:10:25 +0100 Subject: net: checksum: fix warning in skb_checksum This patch fixes a build warning in skb_checksum() by wrapping the csum_partial() usage in skb_checksum(). The problem is that on a few architectures, csum_partial is used with prefix asmlinkage whereas on most architectures it's not. So fix this up generically as we did with csum_block_add_ext() to match the signature. Introduced by 2817a336d4d ("net: skb_checksum: allow custom update/combine for walking skb"). Reported-by: Fengguang Wu Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/checksum.h | 5 +++++ net/core/skbuff.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/include/net/checksum.h b/include/net/checksum.h index 15f33fde826e..37a0e24adbe7 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -98,6 +98,11 @@ static inline __wsum csum_unfold(__sum16 n) return (__force __wsum)n; } +static inline __wsum csum_partial_ext(const void *buff, int len, __wsum sum) +{ + return csum_partial(buff, len, sum); +} + #define CSUM_MANGLED_0 ((__force __sum16)0xffff) static inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 31aab5376cb4..e4115597b38b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2003,7 +2003,7 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum) { const struct skb_checksum_ops ops = { - .update = csum_partial, + .update = csum_partial_ext, .combine = csum_block_add_ext, }; -- cgit v1.2.3 From f8e617e100d7369a0108f96abf4414e9fb82ced7 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 1 Nov 2013 14:07:47 +0800 Subject: net: introduce skb_coalesce_rx_frag() Sometimes we need to coalesce the rx frags to avoid frag list. One example is virtio-net driver which tries to use small frags for both MTU sized packet and GSO packet. So this patch introduce skb_coalesce_rx_frag() to do this. Cc: Rusty Russell Cc: Michael S. Tsirkin Cc: Michael Dalton Cc: Eric Dumazet Acked-by: Michael S. Tsirkin Signed-off-by: Jason Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 +++ net/core/skbuff.c | 12 ++++++++++++ 2 files changed, 15 insertions(+) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 44727b5d4981..2e153b69d318 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1372,6 +1372,9 @@ static inline void skb_fill_page_desc(struct sk_buff *skb, int i, void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, int size, unsigned int truesize); +void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, + unsigned int truesize); + #define SKB_PAGE_ASSERT(skb) BUG_ON(skb_shinfo(skb)->nr_frags) #define SKB_FRAG_ASSERT(skb) BUG_ON(skb_has_frag_list(skb)) #define SKB_LINEAR_ASSERT(skb) BUG_ON(skb_is_nonlinear(skb)) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e4115597b38b..3735fad5616e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -476,6 +476,18 @@ void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, } EXPORT_SYMBOL(skb_add_rx_frag); +void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, + unsigned int truesize) +{ + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + skb_frag_size_add(frag, size); + skb->len += size; + skb->data_len += size; + skb->truesize += truesize; +} +EXPORT_SYMBOL(skb_coalesce_rx_frag); + static void skb_drop_list(struct sk_buff **listp) { kfree_skb_list(*listp); -- cgit v1.2.3 From a6cc0cfa72e0b6d9f2c8fd858aacc32313c4f272 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 6 Nov 2013 09:54:46 -0800 Subject: net: Add layer 2 hardware acceleration operations for macvlan devices Add a operations structure that allows a network interface to export the fact that it supports package forwarding in hardware between physical interfaces and other mac layer devices assigned to it (such as macvlans). This operaions structure can be used by virtual mac devices to bypass software switching so that forwarding can be done in hardware more efficiently. Signed-off-by: John Fastabend Signed-off-by: Neil Horman CC: Andy Gospodarek CC: "David S. Miller" Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 36 +++++++++++++++++++++++++++++++++++- include/linux/if_macvlan.h | 1 + include/linux/netdev_features.h | 2 ++ include/linux/netdevice.h | 36 +++++++++++++++++++++++++++++++++++- include/uapi/linux/if.h | 1 + net/core/dev.c | 18 +++++++++++++----- net/core/ethtool.c | 1 + net/sched/sch_generic.c | 2 +- 8 files changed, 89 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index cc9845ec91c1..af4aaa5893ff 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -297,7 +297,13 @@ netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, int ret; const struct macvlan_dev *vlan = netdev_priv(dev); - ret = macvlan_queue_xmit(skb, dev); + if (vlan->fwd_priv) { + skb->dev = vlan->lowerdev; + ret = dev_hard_start_xmit(skb, skb->dev, NULL, vlan->fwd_priv); + } else { + ret = macvlan_queue_xmit(skb, dev); + } + if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { struct macvlan_pcpu_stats *pcpu_stats; @@ -347,6 +353,21 @@ static int macvlan_open(struct net_device *dev) goto hash_add; } + if (lowerdev->features & NETIF_F_HW_L2FW_DOFFLOAD) { + vlan->fwd_priv = + lowerdev->netdev_ops->ndo_dfwd_add_station(lowerdev, dev); + + /* If we get a NULL pointer back, or if we get an error + * then we should just fall through to the non accelerated path + */ + if (IS_ERR_OR_NULL(vlan->fwd_priv)) { + vlan->fwd_priv = NULL; + } else { + dev->features &= ~NETIF_F_LLTX; + return 0; + } + } + err = -EBUSY; if (macvlan_addr_busy(vlan->port, dev->dev_addr)) goto out; @@ -367,6 +388,11 @@ hash_add: del_unicast: dev_uc_del(lowerdev, dev->dev_addr); out: + if (vlan->fwd_priv) { + lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev, + vlan->fwd_priv); + vlan->fwd_priv = NULL; + } return err; } @@ -375,6 +401,13 @@ static int macvlan_stop(struct net_device *dev) struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; + if (vlan->fwd_priv) { + lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev, + vlan->fwd_priv); + vlan->fwd_priv = NULL; + return 0; + } + dev_uc_unsync(lowerdev, dev); dev_mc_unsync(lowerdev, dev); @@ -833,6 +866,7 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev, if (err < 0) goto destroy_port; + dev->priv_flags |= IFF_MACVLAN; err = netdev_upper_dev_link(lowerdev, dev); if (err) goto destroy_port; diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index ddd33fd5904d..c2702856295e 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -61,6 +61,7 @@ struct macvlan_dev { struct hlist_node hlist; struct macvlan_port *port; struct net_device *lowerdev; + void *fwd_priv; struct macvlan_pcpu_stats __percpu *pcpu_stats; DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ); diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index b05a4b501ab5..1005ebf17575 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -62,6 +62,7 @@ enum { NETIF_F_HW_VLAN_STAG_TX_BIT, /* Transmit VLAN STAG HW acceleration */ NETIF_F_HW_VLAN_STAG_RX_BIT, /* Receive VLAN STAG HW acceleration */ NETIF_F_HW_VLAN_STAG_FILTER_BIT,/* Receive filtering on VLAN STAGs */ + NETIF_F_HW_L2FW_DOFFLOAD_BIT, /* Allow L2 Forwarding in Hardware */ /* * Add your fresh new feature above and remember to update @@ -116,6 +117,7 @@ enum { #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER) #define NETIF_F_HW_VLAN_STAG_RX __NETIF_F(HW_VLAN_STAG_RX) #define NETIF_F_HW_VLAN_STAG_TX __NETIF_F(HW_VLAN_STAG_TX) +#define NETIF_F_HW_L2FW_DOFFLOAD __NETIF_F(HW_L2FW_DOFFLOAD) /* Features valid for ethtool to change */ /* = all defined minus driver/device-class-related */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b6f6efbcfc74..15fa01c9a3bf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -962,6 +962,25 @@ struct netdev_phys_port_id { * Called by vxlan to notify the driver about a UDP port and socket * address family that vxlan is not listening to anymore. The operation * is protected by the vxlan_net->sock_lock. + * + * void* (*ndo_dfwd_add_station)(struct net_device *pdev, + * struct net_device *dev) + * Called by upper layer devices to accelerate switching or other + * station functionality into hardware. 'pdev is the lowerdev + * to use for the offload and 'dev' is the net device that will + * back the offload. Returns a pointer to the private structure + * the upper layer will maintain. + * void (*ndo_dfwd_del_station)(struct net_device *pdev, void *priv) + * Called by upper layer device to delete the station created + * by 'ndo_dfwd_add_station'. 'pdev' is the net device backing + * the station and priv is the structure returned by the add + * operation. + * netdev_tx_t (*ndo_dfwd_start_xmit)(struct sk_buff *skb, + * struct net_device *dev, + * void *priv); + * Callback to use for xmit over the accelerated station. This + * is used in place of ndo_start_xmit on accelerated net + * devices. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1098,6 +1117,15 @@ struct net_device_ops { void (*ndo_del_vxlan_port)(struct net_device *dev, sa_family_t sa_family, __be16 port); + + void* (*ndo_dfwd_add_station)(struct net_device *pdev, + struct net_device *dev); + void (*ndo_dfwd_del_station)(struct net_device *pdev, + void *priv); + + netdev_tx_t (*ndo_dfwd_start_xmit) (struct sk_buff *skb, + struct net_device *dev, + void *priv); }; /* @@ -1195,6 +1223,7 @@ struct net_device { /* Management operations */ const struct net_device_ops *netdev_ops; const struct ethtool_ops *ethtool_ops; + const struct forwarding_accel_ops *fwd_ops; /* Hardware header description */ const struct header_ops *header_ops; @@ -2388,7 +2417,7 @@ int dev_change_carrier(struct net_device *, bool new_carrier); int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_port_id *ppid); int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq); + struct netdev_queue *txq, void *accel_priv); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); extern int netdev_budget; @@ -2967,6 +2996,11 @@ static inline void netif_set_gso_max_size(struct net_device *dev, dev->gso_max_size = size; } +static inline bool netif_is_macvlan(struct net_device *dev) +{ + return dev->priv_flags & IFF_MACVLAN; +} + static inline bool netif_is_bond_master(struct net_device *dev) { return dev->flags & IFF_MASTER && dev->priv_flags & IFF_BONDING; diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h index 1ec407b01e46..d758163b0e43 100644 --- a/include/uapi/linux/if.h +++ b/include/uapi/linux/if.h @@ -83,6 +83,7 @@ #define IFF_SUPP_NOFCS 0x80000 /* device supports sending custom FCS */ #define IFF_LIVE_ADDR_CHANGE 0x100000 /* device supports hardware address * change when it's running */ +#define IFF_MACVLAN 0x200000 /* Macvlan device */ #define IF_GET_IFACE 0x0001 /* for querying only */ diff --git a/net/core/dev.c b/net/core/dev.c index 0e6136546a8c..8ffc52e01ece 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2538,7 +2538,7 @@ static inline int skb_needs_linearize(struct sk_buff *skb, } int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) + struct netdev_queue *txq, void *accel_priv) { const struct net_device_ops *ops = dev->netdev_ops; int rc = NETDEV_TX_OK; @@ -2604,9 +2604,13 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, dev_queue_xmit_nit(skb, dev); skb_len = skb->len; - rc = ops->ndo_start_xmit(skb, dev); + if (accel_priv) + rc = ops->ndo_dfwd_start_xmit(skb, dev, accel_priv); + else + rc = ops->ndo_start_xmit(skb, dev); + trace_net_dev_xmit(skb, rc, dev, skb_len); - if (rc == NETDEV_TX_OK) + if (rc == NETDEV_TX_OK && txq) txq_trans_update(txq); return rc; } @@ -2622,7 +2626,10 @@ gso: dev_queue_xmit_nit(nskb, dev); skb_len = nskb->len; - rc = ops->ndo_start_xmit(nskb, dev); + if (accel_priv) + rc = ops->ndo_dfwd_start_xmit(nskb, dev, accel_priv); + else + rc = ops->ndo_start_xmit(nskb, dev); trace_net_dev_xmit(nskb, rc, dev, skb_len); if (unlikely(rc != NETDEV_TX_OK)) { if (rc & ~NETDEV_TX_MASK) @@ -2647,6 +2654,7 @@ out_kfree_skb: out: return rc; } +EXPORT_SYMBOL_GPL(dev_hard_start_xmit); static void qdisc_pkt_len_init(struct sk_buff *skb) { @@ -2854,7 +2862,7 @@ int dev_queue_xmit(struct sk_buff *skb) if (!netif_xmit_stopped(txq)) { __this_cpu_inc(xmit_recursion); - rc = dev_hard_start_xmit(skb, dev, txq); + rc = dev_hard_start_xmit(skb, dev, txq, NULL); __this_cpu_dec(xmit_recursion); if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 862989898f61..30071dec287a 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -96,6 +96,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_LOOPBACK_BIT] = "loopback", [NETIF_F_RXFCS_BIT] = "rx-fcs", [NETIF_F_RXALL_BIT] = "rx-all", + [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", }; static int ethtool_get_features(struct net_device *dev, void __user *useraddr) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 7fc899a943a8..922a09406ba7 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -126,7 +126,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_stopped(txq)) - ret = dev_hard_start_xmit(skb, dev, txq); + ret = dev_hard_start_xmit(skb, dev, txq, NULL); HARD_TX_UNLOCK(dev, txq); -- cgit v1.2.3 From 0c7ddf36c29c3ce12f2d2931a357ccaa0861035a Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 7 Nov 2013 14:18:24 +0100 Subject: net: move pskb_put() to core code This function has usage beside IPsec so move it to the core skbuff code. While doing so, give it some documentation and change its return type to 'unsigned char *' to be in line with skb_put(). Signed-off-by: Mathias Krause Cc: Steffen Klassert Cc: "David S. Miller" Cc: Herbert Xu Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + include/net/esp.h | 2 -- net/core/skbuff.c | 23 +++++++++++++++++++++++ net/xfrm/xfrm_algo.c | 13 ------------- 4 files changed, 24 insertions(+), 15 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2e153b69d318..491dd6c2c6cc 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1417,6 +1417,7 @@ static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset) /* * Add data to an sk_buff */ +unsigned char *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len); unsigned char *skb_put(struct sk_buff *skb, unsigned int len); static inline unsigned char *__skb_put(struct sk_buff *skb, unsigned int len) { diff --git a/include/net/esp.h b/include/net/esp.h index c92213c38312..a43be85aedc4 100644 --- a/include/net/esp.h +++ b/include/net/esp.h @@ -3,8 +3,6 @@ #include -void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len); - struct ip_esp_hdr; static inline struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3735fad5616e..2fbea08c4f50 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1263,6 +1263,29 @@ free_skb: } EXPORT_SYMBOL(skb_pad); +/** + * pskb_put - add data to the tail of a potentially fragmented buffer + * @skb: start of the buffer to use + * @tail: tail fragment of the buffer to use + * @len: amount of data to add + * + * This function extends the used data area of the potentially + * fragmented buffer. @tail must be the last fragment of @skb -- or + * @skb itself. If this would exceed the total buffer size the kernel + * will panic. A pointer to the first byte of the extra data is + * returned. + */ + +unsigned char *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) +{ + if (tail != skb) { + skb->data_len += len; + skb->len += len; + } + return skb_put(tail, len); +} +EXPORT_SYMBOL_GPL(pskb_put); + /** * skb_put - add data to a buffer * @skb: buffer to use diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c index ab4ef72f0b1d..debe733386f8 100644 --- a/net/xfrm/xfrm_algo.c +++ b/net/xfrm/xfrm_algo.c @@ -802,17 +802,4 @@ int xfrm_count_pfkey_enc_supported(void) } EXPORT_SYMBOL_GPL(xfrm_count_pfkey_enc_supported); -#if defined(CONFIG_INET_ESP) || defined(CONFIG_INET_ESP_MODULE) || defined(CONFIG_INET6_ESP) || defined(CONFIG_INET6_ESP_MODULE) - -void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) -{ - if (tail != skb) { - skb->data_len += len; - skb->len += len; - } - return skb_put(tail, len); -} -EXPORT_SYMBOL_GPL(pskb_put); -#endif - MODULE_LICENSE("GPL"); -- cgit v1.2.3 From bc32383cd6496d595e6a25cdc7cff1da6b694462 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 7 Nov 2013 14:18:26 +0100 Subject: net: skbuff - kernel-doc fixes Use "@" to refer to parameters in the kernel-doc description. According to Documentation/kernel-doc-nano-HOWTO.txt "&" shall be used to refer to structures only. Signed-off-by: Mathias Krause Cc: "David S. Miller" Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- net/core/skbuff.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 491dd6c2c6cc..036ec7d8a83a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1357,7 +1357,7 @@ static inline void __skb_fill_page_desc(struct sk_buff *skb, int i, * @size: the length of the data * * As per __skb_fill_page_desc() -- initialises the @i'th fragment of - * @skb to point to &size bytes at offset @off within @page. In + * @skb to point to @size bytes at offset @off within @page. In * addition updates @skb such that @i is the last fragment. * * Does not take any additional reference on the fragment. diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 2fbea08c4f50..8c5197fe55a4 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1051,8 +1051,8 @@ EXPORT_SYMBOL(__pskb_copy); * @ntail: room to add at tail * @gfp_mask: allocation priority * - * Expands (or creates identical copy, if &nhead and &ntail are zero) - * header of skb. &sk_buff itself is not changed. &sk_buff MUST have + * Expands (or creates identical copy, if @nhead and @ntail are zero) + * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have * reference count of 1. Returns zero in the case of success or error, * if expansion failed. In the last case, &sk_buff is not changed. * @@ -2563,14 +2563,14 @@ EXPORT_SYMBOL(skb_prepare_seq_read); * @data: destination pointer for data to be returned * @st: state variable * - * Reads a block of skb data at &consumed relative to the + * Reads a block of skb data at @consumed relative to the * lower offset specified to skb_prepare_seq_read(). Assigns - * the head of the data block to &data and returns the length + * the head of the data block to @data and returns the length * of the block or 0 if the end of the skb data or the upper * offset has been reached. * * The caller is not required to consume all of the data - * returned, i.e. &consumed is typically set to the number + * returned, i.e. @consumed is typically set to the number * of bytes already consumed and the next call to * skb_seq_read() will return the remaining part of the block. * -- cgit v1.2.3 From 3797d3e8462efdaadb64164ca540626b55fe8336 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Nov 2013 08:37:28 -0800 Subject: net: flow_dissector: small optimizations in IPv4 dissect By moving code around, we avoid : 1) A reload of iph->ihl (bit field, so needs a mask) 2) A conditional test (replaced by a conditional mov on x86) Fast path loads iph->protocol anyway. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 0242035192f1..d6ef17322500 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -68,13 +68,13 @@ ip: iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); if (!iph || iph->ihl < 5) return false; + nhoff += iph->ihl * 4; + ip_proto = iph->protocol; if (ip_is_fragment(iph)) ip_proto = 0; - else - ip_proto = iph->protocol; + iph_to_flow_copy_addrs(flow, iph); - nhoff += iph->ihl * 4; break; } case __constant_htons(ETH_P_IPV6): { -- cgit v1.2.3 From 13eb2ab2d33c57ebddc57437a7d341995fc9138c Mon Sep 17 00:00:00 2001 From: Andreas Henriksson Date: Thu, 7 Nov 2013 18:26:38 +0100 Subject: net: Fix "ip rule delete table 256" When trying to delete a table >= 256 using iproute2 the local table will be deleted. The table id is specified as a netlink attribute when it needs more then 8 bits and iproute2 then sets the table field to RT_TABLE_UNSPEC (0). Preconditions to matching the table id in the rule delete code doesn't seem to take the "table id in netlink attribute" into condition so the frh_get_table helper function never gets to do its job when matching against current rule. Use the helper function twice instead of peaking at the table value directly. Originally reported at: http://bugs.debian.org/724783 Reported-by: Nicolas HICHER Signed-off-by: Andreas Henriksson Signed-off-by: David S. Miller --- net/core/fib_rules.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 2e654138433c..f409e0bd35c0 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -460,7 +460,8 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) if (frh->action && (frh->action != rule->action)) continue; - if (frh->table && (frh_get_table(frh, tb) != rule->table)) + if (frh_get_table(frh, tb) && + (frh_get_table(frh, tb) != rule->table)) continue; if (tb[FRA_PRIORITY] && -- cgit v1.2.3 From 6aafeef03b9d9ecf255f3a80ed85ee070260e1ae Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 6 Nov 2013 17:52:20 +0100 Subject: netfilter: push reasm skb through instead of original frag skbs Pushing original fragments through causes several problems. For example for matching, frags may not be matched correctly. Take following example: On HOSTA do: ip6tables -I INPUT -p icmpv6 -j DROP ip6tables -I INPUT -p icmpv6 -m icmp6 --icmpv6-type 128 -j ACCEPT and on HOSTB you do: ping6 HOSTA -s2000 (MTU is 1500) Incoming echo requests will be filtered out on HOSTA. This issue does not occur with smaller packets than MTU (where fragmentation does not happen) As was discussed previously, the only correct solution seems to be to use reassembled skb instead of separete frags. Doing this has positive side effects in reducing sk_buff by one pointer (nfct_reasm) and also the reams dances in ipvs and conntrack can be removed. Future plan is to remove net/ipv6/netfilter/nf_conntrack_reasm.c entirely and use code in net/ipv6/reassembly.c instead. Signed-off-by: Jiri Pirko Acked-by: Julian Anastasov Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/linux/skbuff.h | 32 --------------- include/net/ip_vs.h | 32 +-------------- include/net/netfilter/ipv6/nf_defrag_ipv6.h | 4 +- net/core/skbuff.c | 3 -- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 56 +------------------------- net/ipv6/netfilter/nf_conntrack_reasm.c | 19 +-------- net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 7 +++- net/netfilter/ipvs/ip_vs_core.c | 55 +------------------------ net/netfilter/ipvs/ip_vs_pe_sip.c | 8 +--- 9 files changed, 13 insertions(+), 203 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 036ec7d8a83a..215b5ea1cb30 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -337,11 +337,6 @@ typedef unsigned int sk_buff_data_t; typedef unsigned char *sk_buff_data_t; #endif -#if defined(CONFIG_NF_DEFRAG_IPV4) || defined(CONFIG_NF_DEFRAG_IPV4_MODULE) || \ - defined(CONFIG_NF_DEFRAG_IPV6) || defined(CONFIG_NF_DEFRAG_IPV6_MODULE) -#define NET_SKBUFF_NF_DEFRAG_NEEDED 1 -#endif - /** * struct sk_buff - socket buffer * @next: Next buffer in list @@ -374,7 +369,6 @@ typedef unsigned char *sk_buff_data_t; * @protocol: Packet protocol from driver * @destructor: Destruct function * @nfct: Associated connection, if any - * @nfct_reasm: netfilter conntrack re-assembly pointer * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c * @skb_iif: ifindex of device we arrived on * @tc_index: Traffic control index @@ -463,9 +457,6 @@ struct sk_buff { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack *nfct; #endif -#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED - struct sk_buff *nfct_reasm; -#endif #ifdef CONFIG_BRIDGE_NETFILTER struct nf_bridge_info *nf_bridge; #endif @@ -2595,18 +2586,6 @@ static inline void nf_conntrack_get(struct nf_conntrack *nfct) atomic_inc(&nfct->use); } #endif -#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED -static inline void nf_conntrack_get_reasm(struct sk_buff *skb) -{ - if (skb) - atomic_inc(&skb->users); -} -static inline void nf_conntrack_put_reasm(struct sk_buff *skb) -{ - if (skb) - kfree_skb(skb); -} -#endif #ifdef CONFIG_BRIDGE_NETFILTER static inline void nf_bridge_put(struct nf_bridge_info *nf_bridge) { @@ -2625,10 +2604,6 @@ static inline void nf_reset(struct sk_buff *skb) nf_conntrack_put(skb->nfct); skb->nfct = NULL; #endif -#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED - nf_conntrack_put_reasm(skb->nfct_reasm); - skb->nfct_reasm = NULL; -#endif #ifdef CONFIG_BRIDGE_NETFILTER nf_bridge_put(skb->nf_bridge); skb->nf_bridge = NULL; @@ -2650,10 +2625,6 @@ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src) nf_conntrack_get(src->nfct); dst->nfctinfo = src->nfctinfo; #endif -#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED - dst->nfct_reasm = src->nfct_reasm; - nf_conntrack_get_reasm(src->nfct_reasm); -#endif #ifdef CONFIG_BRIDGE_NETFILTER dst->nf_bridge = src->nf_bridge; nf_bridge_get(src->nf_bridge); @@ -2665,9 +2636,6 @@ static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src) #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(dst->nfct); #endif -#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED - nf_conntrack_put_reasm(dst->nfct_reasm); -#endif #ifdef CONFIG_BRIDGE_NETFILTER nf_bridge_put(dst->nf_bridge); #endif diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index cd7275f9c463..5679d927562b 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -109,7 +109,6 @@ extern int ip_vs_conn_tab_size; struct ip_vs_iphdr { __u32 len; /* IPv4 simply where L4 starts IPv6 where L4 Transport Header starts */ - __u32 thoff_reasm; /* Transport Header Offset in nfct_reasm skb */ __u16 fragoffs; /* IPv6 fragment offset, 0 if first frag (or not frag)*/ __s16 protocol; __s32 flags; @@ -117,34 +116,12 @@ struct ip_vs_iphdr { union nf_inet_addr daddr; }; -/* Dependency to module: nf_defrag_ipv6 */ -#if defined(CONFIG_NF_DEFRAG_IPV6) || defined(CONFIG_NF_DEFRAG_IPV6_MODULE) -static inline struct sk_buff *skb_nfct_reasm(const struct sk_buff *skb) -{ - return skb->nfct_reasm; -} -static inline void *frag_safe_skb_hp(const struct sk_buff *skb, int offset, - int len, void *buffer, - const struct ip_vs_iphdr *ipvsh) -{ - if (unlikely(ipvsh->fragoffs && skb_nfct_reasm(skb))) - return skb_header_pointer(skb_nfct_reasm(skb), - ipvsh->thoff_reasm, len, buffer); - - return skb_header_pointer(skb, offset, len, buffer); -} -#else -static inline struct sk_buff *skb_nfct_reasm(const struct sk_buff *skb) -{ - return NULL; -} static inline void *frag_safe_skb_hp(const struct sk_buff *skb, int offset, int len, void *buffer, const struct ip_vs_iphdr *ipvsh) { return skb_header_pointer(skb, offset, len, buffer); } -#endif static inline void ip_vs_fill_ip4hdr(const void *nh, struct ip_vs_iphdr *iphdr) @@ -171,19 +148,12 @@ ip_vs_fill_iph_skb(int af, const struct sk_buff *skb, struct ip_vs_iphdr *iphdr) (struct ipv6hdr *)skb_network_header(skb); iphdr->saddr.in6 = iph->saddr; iphdr->daddr.in6 = iph->daddr; - /* ipv6_find_hdr() updates len, flags, thoff_reasm */ - iphdr->thoff_reasm = 0; + /* ipv6_find_hdr() updates len, flags */ iphdr->len = 0; iphdr->flags = 0; iphdr->protocol = ipv6_find_hdr(skb, &iphdr->len, -1, &iphdr->fragoffs, &iphdr->flags); - /* get proto from re-assembled packet and it's offset */ - if (skb_nfct_reasm(skb)) - iphdr->protocol = ipv6_find_hdr(skb_nfct_reasm(skb), - &iphdr->thoff_reasm, - -1, NULL, NULL); - } else #endif { diff --git a/include/net/netfilter/ipv6/nf_defrag_ipv6.h b/include/net/netfilter/ipv6/nf_defrag_ipv6.h index 5613412e7dc2..27666d8a0bd0 100644 --- a/include/net/netfilter/ipv6/nf_defrag_ipv6.h +++ b/include/net/netfilter/ipv6/nf_defrag_ipv6.h @@ -6,9 +6,7 @@ void nf_defrag_ipv6_enable(void); int nf_ct_frag6_init(void); void nf_ct_frag6_cleanup(void); struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user); -void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb, - struct net_device *in, struct net_device *out, - int (*okfn)(struct sk_buff *)); +void nf_ct_frag6_consume_orig(struct sk_buff *skb); struct inet_frags_ctl; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8c5197fe55a4..8cec1e6b844d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -592,9 +592,6 @@ static void skb_release_head_state(struct sk_buff *skb) #if IS_ENABLED(CONFIG_NF_CONNTRACK) nf_conntrack_put(skb->nfct); #endif -#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED - nf_conntrack_put_reasm(skb->nfct_reasm); -#endif #ifdef CONFIG_BRIDGE_NETFILTER nf_bridge_put(skb->nf_bridge); #endif diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 486545eb42ce..4cbc6b290dd5 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -169,64 +169,13 @@ out: return nf_conntrack_confirm(skb); } -static unsigned int __ipv6_conntrack_in(struct net *net, - unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct sk_buff *reasm = skb->nfct_reasm; - const struct nf_conn_help *help; - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - - /* This packet is fragmented and has reassembled packet. */ - if (reasm) { - /* Reassembled packet isn't parsed yet ? */ - if (!reasm->nfct) { - unsigned int ret; - - ret = nf_conntrack_in(net, PF_INET6, hooknum, reasm); - if (ret != NF_ACCEPT) - return ret; - } - - /* Conntrack helpers need the entire reassembled packet in the - * POST_ROUTING hook. In case of unconfirmed connections NAT - * might reassign a helper, so the entire packet is also - * required. - */ - ct = nf_ct_get(reasm, &ctinfo); - if (ct != NULL && !nf_ct_is_untracked(ct)) { - help = nfct_help(ct); - if ((help && help->helper) || !nf_ct_is_confirmed(ct)) { - nf_conntrack_get_reasm(reasm); - NF_HOOK_THRESH(NFPROTO_IPV6, hooknum, reasm, - (struct net_device *)in, - (struct net_device *)out, - okfn, NF_IP6_PRI_CONNTRACK + 1); - return NF_DROP_ERR(-ECANCELED); - } - } - - nf_conntrack_get(reasm->nfct); - skb->nfct = reasm->nfct; - skb->nfctinfo = reasm->nfctinfo; - return NF_ACCEPT; - } - - return nf_conntrack_in(net, PF_INET6, hooknum, skb); -} - static unsigned int ipv6_conntrack_in(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - return __ipv6_conntrack_in(dev_net(in), ops->hooknum, skb, in, out, - okfn); + return nf_conntrack_in(dev_net(in), PF_INET6, ops->hooknum, skb); } static unsigned int ipv6_conntrack_local(const struct nf_hook_ops *ops, @@ -240,8 +189,7 @@ static unsigned int ipv6_conntrack_local(const struct nf_hook_ops *ops, net_notice_ratelimited("ipv6_conntrack_local: packet too short\n"); return NF_ACCEPT; } - return __ipv6_conntrack_in(dev_net(out), ops->hooknum, skb, in, out, - okfn); + return nf_conntrack_in(dev_net(out), PF_INET6, ops->hooknum, skb); } static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = { diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 4a258263d8ec..767ab8da8218 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -633,31 +633,16 @@ ret_orig: return skb; } -void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb, - struct net_device *in, struct net_device *out, - int (*okfn)(struct sk_buff *)) +void nf_ct_frag6_consume_orig(struct sk_buff *skb) { struct sk_buff *s, *s2; - unsigned int ret = 0; for (s = NFCT_FRAG6_CB(skb)->orig; s;) { - nf_conntrack_put_reasm(s->nfct_reasm); - nf_conntrack_get_reasm(skb); - s->nfct_reasm = skb; - s2 = s->next; s->next = NULL; - - if (ret != -ECANCELED) - ret = NF_HOOK_THRESH(NFPROTO_IPV6, hooknum, s, - in, out, okfn, - NF_IP6_PRI_CONNTRACK_DEFRAG + 1); - else - kfree_skb(s); - + consume_skb(s); s = s2; } - nf_conntrack_put_reasm(skb); } static int nf_ct_net_init(struct net *net) diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index ec483aa3f60f..7b9a748c6bac 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -75,8 +75,11 @@ static unsigned int ipv6_defrag(const struct nf_hook_ops *ops, if (reasm == skb) return NF_ACCEPT; - nf_ct_frag6_output(ops->hooknum, reasm, (struct net_device *)in, - (struct net_device *)out, okfn); + nf_ct_frag6_consume_orig(reasm); + + NF_HOOK_THRESH(NFPROTO_IPV6, ops->hooknum, reasm, + (struct net_device *) in, (struct net_device *) out, + okfn, NF_IP6_PRI_CONNTRACK_DEFRAG + 1); return NF_STOLEN; } diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 34fda62f40f6..4f26ee46b51f 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1139,12 +1139,6 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) ip_vs_fill_iph_skb(af, skb, &iph); #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { - if (!iph.fragoffs && skb_nfct_reasm(skb)) { - struct sk_buff *reasm = skb_nfct_reasm(skb); - /* Save fw mark for coming frags */ - reasm->ipvs_property = 1; - reasm->mark = skb->mark; - } if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { int related; int verdict = ip_vs_out_icmp_v6(skb, &related, @@ -1614,12 +1608,6 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { - if (!iph.fragoffs && skb_nfct_reasm(skb)) { - struct sk_buff *reasm = skb_nfct_reasm(skb); - /* Save fw mark for coming frags. */ - reasm->ipvs_property = 1; - reasm->mark = skb->mark; - } if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { int related; int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum, @@ -1671,9 +1659,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) /* sorry, all this trouble for a no-hit :) */ IP_VS_DBG_PKT(12, af, pp, skb, 0, "ip_vs_in: packet continues traversal as normal"); - if (iph.fragoffs && !skb_nfct_reasm(skb)) { + if (iph.fragoffs) { /* Fragment that couldn't be mapped to a conn entry - * and don't have any pointer to a reasm skb * is missing module nf_defrag_ipv6 */ IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n"); @@ -1755,38 +1742,6 @@ ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb, #ifdef CONFIG_IP_VS_IPV6 -/* - * AF_INET6 fragment handling - * Copy info from first fragment, to the rest of them. - */ -static unsigned int -ip_vs_preroute_frag6(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct sk_buff *reasm = skb_nfct_reasm(skb); - struct net *net; - - /* Skip if not a "replay" from nf_ct_frag6_output or first fragment. - * ipvs_property is set when checking first fragment - * in ip_vs_in() and ip_vs_out(). - */ - if (reasm) - IP_VS_DBG(2, "Fragment recv prop:%d\n", reasm->ipvs_property); - if (!reasm || !reasm->ipvs_property) - return NF_ACCEPT; - - net = skb_net(skb); - if (!net_ipvs(net)->enable) - return NF_ACCEPT; - - /* Copy stored fw mark, saved in ip_vs_{in,out} */ - skb->mark = reasm->mark; - - return NF_ACCEPT; -} - /* * AF_INET6 handler in NF_INET_LOCAL_IN chain * Schedule and forward packets from remote clients @@ -1924,14 +1879,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .priority = 100, }, #ifdef CONFIG_IP_VS_IPV6 - /* After mangle & nat fetch 2:nd fragment and following */ - { - .hook = ip_vs_preroute_frag6, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP6_PRI_NAT_DST + 1, - }, /* After packet filtering, change source only for VS/NAT */ { .hook = ip_vs_reply6, diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index 9ef22bdce9f1..bed5f7042529 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -65,7 +65,6 @@ static int get_callid(const char *dptr, unsigned int dataoff, static int ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb) { - struct sk_buff *reasm = skb_nfct_reasm(skb); struct ip_vs_iphdr iph; unsigned int dataoff, datalen, matchoff, matchlen; const char *dptr; @@ -79,15 +78,10 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb) /* todo: IPv6 fragments: * I think this only should be done for the first fragment. /HS */ - if (reasm) { - skb = reasm; - dataoff = iph.thoff_reasm + sizeof(struct udphdr); - } else - dataoff = iph.len + sizeof(struct udphdr); + dataoff = iph.len + sizeof(struct udphdr); if (dataoff >= skb->len) return -EINVAL; - /* todo: Check if this will mess-up the reasm skb !!! /HS */ retc = skb_linearize(skb); if (retc < 0) return retc; -- cgit v1.2.3